mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-24 09:10:57 +00:00
Compare commits
2 Commits
master
...
fix_vs_sdl
Author | SHA1 | Date | |
---|---|---|---|
00ddb10fe2 | |||
b3a6018bbf |
@ -13,6 +13,8 @@ WORKDIR /app
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable cuBLAS
|
||||
ENV GGML_CUDA=1
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake git \
|
||||
@ -23,8 +25,7 @@ ENV CUDA_MAIN_VERSION=12.3
|
||||
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
|
||||
|
||||
COPY .. .
|
||||
# Enable cuBLAS
|
||||
RUN make base.en CMAKE_ARGS="-DGGML_CUDA=1"
|
||||
RUN make base.en
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
ENV CUDA_MAIN_VERSION=12.3
|
||||
@ -36,5 +37,4 @@ RUN apt-get update && \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
@ -1,28 +0,0 @@
|
||||
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake git \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY .. .
|
||||
# Enable SYCL
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
make base.en CMAKE_ARGS="-DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16}"
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
@ -1,39 +0,0 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc4.0.1
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the MUSA runtime image
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake git && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*
|
||||
|
||||
COPY .. .
|
||||
# Enable muBLAS
|
||||
RUN make base.en CMAKE_ARGS="-DGGML_MUSA=1"
|
||||
|
||||
RUN find /app/build -name "*.o" -delete && \
|
||||
find /app/build -name "*.a" -delete && \
|
||||
rm -rf /app/build/CMakeFiles && \
|
||||
rm -rf /app/build/cmake_install.cmake && \
|
||||
rm -rf /app/build/_deps
|
||||
|
||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg wget cmake git && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
RUN du -sh /app/*
|
||||
RUN find /app -type f -size +100M
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
@ -16,5 +16,4 @@ RUN apt-get update && \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
@ -1,3 +0,0 @@
|
||||
build*/
|
||||
.github/
|
||||
.devops/
|
44
.github/workflows/bindings-ruby.yml
vendored
44
.github/workflows/bindings-ruby.yml
vendored
@ -1,11 +1,45 @@
|
||||
name: Bindings Tests (Ruby)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- bindings/ruby/**
|
||||
- src/**/*.c
|
||||
- src/**/*.cpp
|
||||
- src/**/*.h
|
||||
- src/**/*.m
|
||||
- src/**/*.metal
|
||||
- include/**/*.c
|
||||
- include/**/*.cpp
|
||||
- include/**/*.h
|
||||
- include/**/*.m
|
||||
- include/**/*.metal
|
||||
- ggml/**/*.c
|
||||
- ggml/**/*.cpp
|
||||
- ggml/**/*.h
|
||||
- ggml/**/*.m
|
||||
- ggml/**/*.metal
|
||||
- scripts/get-flags.mk
|
||||
- examples/dr_wav.h
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths:
|
||||
- bindings/ruby/**
|
||||
- src/**/*.c
|
||||
- src/**/*.cpp
|
||||
- src/**/*.h
|
||||
- src/**/*.m
|
||||
- src/**/*.metal
|
||||
- include/**/*.c
|
||||
- include/**/*.cpp
|
||||
- include/**/*.h
|
||||
- include/**/*.m
|
||||
- include/**/*.metal
|
||||
- ggml/**/*.c
|
||||
- ggml/**/*.cpp
|
||||
- ggml/**/*.h
|
||||
- ggml/**/*.m
|
||||
- ggml/**/*.metal
|
||||
- scripts/get-flags.mk
|
||||
- examples/dr_wav.h
|
||||
|
||||
jobs:
|
||||
ubuntu-22:
|
||||
@ -16,6 +50,6 @@ jobs:
|
||||
steps:
|
||||
- uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: '3.2'
|
||||
ruby-version: '3.1'
|
||||
- uses: actions/checkout@v4
|
||||
- run: rake test
|
||||
|
751
.github/workflows/build.yml
vendored
751
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load Diff
4
.github/workflows/docker.yml
vendored
4
.github/workflows/docker.yml
vendored
@ -18,8 +18,6 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
|
||||
- { tag: "main-musa", dockerfile: ".devops/main-musa.Dockerfile", platform: "linux/amd64" }
|
||||
- { tag: "main-intel", dockerfile: ".devops/main-intel.Dockerfile", platform: "linux/amd64" }
|
||||
#TODO: the cuda image keeps failing - disable for now
|
||||
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
|
||||
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
|
||||
@ -30,8 +28,6 @@ jobs:
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v7.0.0-28
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
91
.github/workflows/examples-wasm.yml
vendored
91
.github/workflows/examples-wasm.yml
vendored
@ -1,91 +0,0 @@
|
||||
name: Examples WASM
|
||||
on:
|
||||
push:
|
||||
branches: ["master"]
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: "pages"
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
deploy-wasm-github-pages:
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Pages
|
||||
uses: actions/configure-pages@v4
|
||||
|
||||
- name: Setup emsdk
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
|
||||
- name: Build WASM Examples
|
||||
# Enable for real build later in whisper.cpp
|
||||
run: |
|
||||
mkdir -p build-em && cd build-em
|
||||
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||
make -j
|
||||
|
||||
- name: Create staging directory
|
||||
run: mkdir -p staging
|
||||
|
||||
- name: Create .nojekyll file in staging directory
|
||||
run: touch staging/.nojekyll
|
||||
|
||||
- name: Copy application files
|
||||
run: |
|
||||
build_dir=build-em/bin
|
||||
|
||||
ls ${build_dir}
|
||||
|
||||
# command.wasm
|
||||
target_dir=staging/command.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/command.wasm/{index.html,command.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libcommand.js ${target_dir}
|
||||
|
||||
# bench.wasm
|
||||
target_dir=staging/bench.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/bench.wasm/{index.html,bench.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libbench.js ${target_dir}
|
||||
|
||||
# stream.wasm
|
||||
target_dir=staging/stream.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/stream.wasm/{index.html,stream.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libstream.js ${target_dir}
|
||||
|
||||
# whisper.wasm (this will be the main example page)
|
||||
target_dir=staging
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/whisper.wasm/{index.html,main.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libmain.js ${target_dir}
|
||||
|
||||
# Copy Cross-Origin Isolation service worker
|
||||
cp -v examples/coi-serviceworker.js staging/
|
||||
|
||||
- name: List files in staging directory (for debugging)
|
||||
run: |
|
||||
echo "Files in staging directory:"
|
||||
find staging -type f | sort
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@v3
|
||||
with:
|
||||
path: ./staging
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v4
|
5
.gitignore
vendored
5
.gitignore
vendored
@ -14,7 +14,6 @@
|
||||
|
||||
build/
|
||||
build-*/
|
||||
build_*/
|
||||
|
||||
# SPM
|
||||
.build/
|
||||
@ -50,8 +49,6 @@ extra/bench-gg.txt
|
||||
models/*.mlmodel
|
||||
models/*.mlmodelc
|
||||
models/*.mlpackage
|
||||
models/*-encoder-openvino.xml
|
||||
models/*-encoder-openvino-cache/
|
||||
bindings/java/.gradle/
|
||||
bindings/java/.idea/
|
||||
.idea/
|
||||
@ -61,5 +58,3 @@ cmake-build-debug/
|
||||
.cxx/
|
||||
.gradle/
|
||||
local.properties
|
||||
.log
|
||||
.exe
|
0
.gitmodules
vendored
Normal file
0
.gitmodules
vendored
Normal file
@ -1,6 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
||||
project("whisper.cpp" C CXX)
|
||||
project("whisper.cpp" VERSION 1.7.5)
|
||||
project("whisper.cpp" VERSION 1.7.4)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(SOVERSION 1)
|
||||
@ -38,13 +38,8 @@ if (EMSCRIPTEN)
|
||||
|
||||
# TODO: without these, we get the following error:
|
||||
# wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
||||
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s TOTAL_STACK=5242880")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s TOTAL_STACK=5242880")
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
|
||||
else()
|
||||
if (MINGW)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
@ -59,13 +54,15 @@ option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
# option list
|
||||
#
|
||||
|
||||
# general
|
||||
option(WHISPER_CCACHE "whisper: use ccache if available" ON)
|
||||
|
||||
# debug
|
||||
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
|
||||
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
|
||||
|
||||
# build
|
||||
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
|
||||
option(WHISPER_USE_SYSTEM_GGML "whisper: use system-installed GGML library" OFF)
|
||||
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
|
||||
|
||||
# sanitizers
|
||||
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
|
||||
@ -93,6 +90,7 @@ option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_CCACHE ${WHISPER_CCACHE})
|
||||
set(GGML_SANITIZE_THREAD ${WHISPER_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${WHISPER_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
|
||||
@ -117,43 +115,13 @@ whisper_option_depr(WARNING WHISPER_OPENMP GGML_OPENMP)
|
||||
whisper_option_depr(WARNING WHISPER_RPC GGML_RPC)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
|
||||
whisper_option_depr(WARNING WHISPER_CCACHE GGML_CCACHE)
|
||||
|
||||
if (GGML_CUDA AND NOT MSVC)
|
||||
#GGML_CUDA enabled, add the necessary compile options -Wno-deprecated-gpu-targets
|
||||
add_compile_options(-Wno-deprecated-gpu-targets)
|
||||
endif()
|
||||
|
||||
#
|
||||
# build the library
|
||||
#
|
||||
|
||||
if (NOT TARGET ggml)
|
||||
if (WHISPER_USE_SYSTEM_GGML)
|
||||
find_package(ggml REQUIRED)
|
||||
if (NOT ggml_FOUND)
|
||||
message(FATAL_ERROR "System-installed GGML library not found.")
|
||||
endif()
|
||||
add_library(ggml ALIAS ggml::ggml)
|
||||
else()
|
||||
add_subdirectory(ggml)
|
||||
if(WIN32)
|
||||
# The following adds a _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR macro and is a workaround for
|
||||
# the Windows C++ standard library which does not support constexpr mutexes.
|
||||
# From the release notes://github.com/microsoft/STL/wiki/Changelog
|
||||
# Disable constexpr mutex constructor on Windows
|
||||
# Fixed mutex's constructor to be constexpr. #3824 #4000 #4339
|
||||
# Note: Programs that aren't following the documented restrictions on binary compatibility may encounter
|
||||
# null dereferences in mutex machinery. You must follow this rule:
|
||||
# When you mix binaries built by different supported versions of the toolset, the Redistributable version
|
||||
# must be at least as new as the latest toolset used by any app component.
|
||||
# You can define _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR as an escape hatch.
|
||||
#
|
||||
# Specifically to whisper.cpp this would cause a crash when using the Java bindings.
|
||||
# resulting in a Invalid memory access error.
|
||||
target_compile_definitions(ggml-base PRIVATE _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
|
||||
endif()
|
||||
endif()
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
add_subdirectory(src)
|
||||
@ -208,44 +176,10 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
|
||||
#
|
||||
|
||||
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
#include(CTest)
|
||||
#add_subdirectory(tests)
|
||||
endif ()
|
||||
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
endif()
|
||||
|
||||
if (MSVC)
|
||||
set(MSVC_WARNING_FLAGS
|
||||
/wd4101 # Unreferenced local variable
|
||||
/wd4005 # Macro redefinition
|
||||
/wd4065 # switch statement contains 'default' but no 'case' labels
|
||||
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
||||
/wd4244 # Conversion from one type to another type, possible loss of ata
|
||||
/wd4805 # Unsafe mix of type
|
||||
/wd4305 # Truncation from 'type1' to 'type2' (often double to float)
|
||||
/wd4996 # Function or variable may be unsafe/deprecated
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
if(TARGET ${target_name})
|
||||
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
disable_msvc_warnings(whisper)
|
||||
disable_msvc_warnings(common)
|
||||
disable_msvc_warnings(common-sdl)
|
||||
disable_msvc_warnings(lsp)
|
||||
disable_msvc_warnings(wchess-core)
|
||||
disable_msvc_warnings(whisper-command)
|
||||
disable_msvc_warnings(whisper-cli)
|
||||
disable_msvc_warnings(whisper-server)
|
||||
disable_msvc_warnings(whisper-stream)
|
||||
disable_msvc_warnings(whisper-talk-llama)
|
||||
disable_msvc_warnings(whisper-bench)
|
||||
disable_msvc_warnings(quantize)
|
||||
disable_msvc_warnings(vad-speech-segments)
|
||||
endif()
|
||||
endif()
|
||||
|
19
Makefile
19
Makefile
@ -4,7 +4,7 @@
|
||||
|
||||
.PHONY: build
|
||||
build:
|
||||
cmake -B build $(CMAKE_ARGS)
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
|
||||
# download a few audio samples into folder "./samples":
|
||||
@ -18,6 +18,17 @@ samples:
|
||||
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
|
||||
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
|
||||
@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
|
||||
@echo "Converting to 16-bit WAV ..."
|
||||
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
|
||||
@rm samples/*.ogg
|
||||
@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
|
||||
@rm samples/mm1.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
|
||||
@rm samples/a13.mp3
|
||||
@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
|
||||
@rm samples/diffusion2023-07-03.flac
|
||||
|
||||
#
|
||||
# Models
|
||||
@ -41,17 +52,17 @@ samples:
|
||||
|
||||
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo:
|
||||
bash ./models/download-ggml-model.sh $@
|
||||
cmake -B build $(CMAKE_ARGS)
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
@echo ""
|
||||
@echo "==============================================="
|
||||
@echo "Running $@ on all samples in ./samples ..."
|
||||
@echo "==============================================="
|
||||
@echo ""
|
||||
@for f in samples/*.{flac,mp3,ogg,wav}; do \
|
||||
@for f in samples/*.wav; do \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "" ; \
|
||||
./build/bin/whisper-cli -m models/ggml-$@.bin -f $$f ; \
|
||||
echo "" ; \
|
||||
|
19
Package.swift
Normal file
19
Package.swift
Normal file
@ -0,0 +1,19 @@
|
||||
// swift-tools-version:5.5
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "whisper",
|
||||
platforms: [
|
||||
.macOS(.v12),
|
||||
.iOS(.v14),
|
||||
.watchOS(.v4),
|
||||
.tvOS(.v14)
|
||||
],
|
||||
products: [
|
||||
.library(name: "whisper", targets: ["whisper"]),
|
||||
],
|
||||
targets: [
|
||||
.systemLibrary(name: "whisper", pkgConfig: "whisper"),
|
||||
]
|
||||
)
|
249
README.md
249
README.md
@ -2,12 +2,15 @@
|
||||
|
||||

|
||||
|
||||
[](https://github.com/ggml-org/whisper.cpp/actions)
|
||||
[](https://github.com/ggerganov/whisper.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://conan.io/center/whisper-cpp)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.7.5](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.7.5) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/)
|
||||
> [!NOTE]
|
||||
> New maintenance roadmap: https://github.com/ggerganov/whisper.cpp/discussions/2788
|
||||
|
||||
Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
@ -23,9 +26,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
||||
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
|
||||
- [OpenVINO Support](#openvino-support)
|
||||
- [Ascend NPU Support](#ascend-npu-support)
|
||||
- [Moore Threads GPU Support](#moore-threads-gpu-support)
|
||||
- [C-style API](https://github.com/ggml-org/whisper.cpp/blob/master/include/whisper.h)
|
||||
- [Voice Activity Detection (VAD)](#voice-activity-detection-vad)
|
||||
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
|
||||
|
||||
Supported platforms:
|
||||
|
||||
@ -33,14 +34,14 @@ Supported platforms:
|
||||
- [x] [iOS](examples/whisper.objc)
|
||||
- [x] [Android](examples/whisper.android)
|
||||
- [x] [Java](bindings/java/README.md)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggml-org/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] [WebAssembly](examples/whisper.wasm)
|
||||
- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168))
|
||||
- [x] [Raspberry Pi](https://github.com/ggml-org/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggml-org/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
|
||||
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
|
||||
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggml-org/ggml) machine learning library.
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
|
||||
|
||||
Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
|
||||
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
|
||||
@ -53,14 +54,14 @@ https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a
|
||||
|
||||
On Apple Silicon, the inference runs fully on the GPU via Metal:
|
||||
|
||||
https://github.com/ggml-org/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
|
||||
## Quick start
|
||||
|
||||
First clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
```
|
||||
|
||||
Navigate into the directory:
|
||||
@ -151,7 +152,6 @@ standard cmake setup with:
|
||||
cmake -B build -DGGML_BLAS=1
|
||||
cmake --build build --config Release
|
||||
./build/bin/whisper-cli [ .. etc .. ]
|
||||
```
|
||||
|
||||
## Quantization
|
||||
|
||||
@ -184,11 +184,11 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
|
||||
```
|
||||
|
||||
- To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
|
||||
- Python 3.11 is recommended.
|
||||
- Python 3.10 is recommended.
|
||||
- MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
|
||||
- [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
|
||||
- To create an environment, use: `conda create -n py311-whisper python=3.11 -y`
|
||||
- To activate the environment, use: `conda activate py311-whisper`
|
||||
- To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
|
||||
- To activate the environment, use: `conda activate py310-whisper`
|
||||
|
||||
- Generate a Core ML model. For example, to generate a `base.en` model, use:
|
||||
|
||||
@ -225,7 +225,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
|
||||
The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
|
||||
Next runs are faster.
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggml-org/whisper.cpp/pull/566).
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
|
||||
|
||||
## OpenVINO support
|
||||
|
||||
@ -267,7 +267,7 @@ This can result in significant speedup in encoder performance. Here are the inst
|
||||
|
||||
- Build `whisper.cpp` with OpenVINO support:
|
||||
|
||||
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2024.6.0](https://github.com/openvinotoolkit/openvino/releases/tag/2024.6.0). Ready to use Binaries of the required libraries can be found in the [OpenVino Archives](https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/)
|
||||
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
|
||||
|
||||
After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
|
||||
|
||||
@ -310,7 +310,7 @@ This can result in significant speedup in encoder performance. Here are the inst
|
||||
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
|
||||
cached for the next run.
|
||||
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
@ -324,12 +324,6 @@ cmake -B build -DGGML_CUDA=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
or for newer NVIDIA GPU's (RTX 5000 series):
|
||||
```
|
||||
cmake -B build -DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## Vulkan GPU support
|
||||
Cross-vendor solution which allows you to accelerate workload on your GPU.
|
||||
First, make sure your graphics card driver provides support for Vulkan API.
|
||||
@ -383,56 +377,6 @@ Run the inference examples as usual, for example:
|
||||
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
|
||||
- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
|
||||
|
||||
## Moore Threads GPU support
|
||||
|
||||
With Moore Threads cards the processing of the models is done efficiently on the GPU via muBLAS and custom MUSA kernels.
|
||||
First, make sure you have installed `MUSA SDK rc4.0.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=4.0.1
|
||||
|
||||
Now build `whisper.cpp` with MUSA support:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_MUSA=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
or specify the architecture for your Moore Threads GPU. For example, if you have a MTT S80 GPU, you can specify the architecture as follows:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_MUSA=1 -DMUSA_ARCHITECTURES="21"
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## FFmpeg support (Linux only)
|
||||
|
||||
If you want to support more audio formats (such as Opus and AAC), you can turn on the `WHISPER_FFMPEG` build flag to enable FFmpeg integration.
|
||||
|
||||
First, you need to install required libraries:
|
||||
|
||||
```bash
|
||||
# Debian/Ubuntu
|
||||
sudo apt install libavcodec-dev libavformat-dev libavutil-dev
|
||||
|
||||
# RHEL/Fedora
|
||||
sudo dnf install libavcodec-free-devel libavformat-free-devel libavutil-free-devel
|
||||
```
|
||||
|
||||
Then you can build the project as follows:
|
||||
|
||||
```bash
|
||||
cmake -B build -D WHISPER_FFMPEG=yes
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
Run the following example to confirm it's working:
|
||||
|
||||
```bash
|
||||
# Convert an audio file to Opus format
|
||||
ffmpeg -i samples/jfk.wav jfk.opus
|
||||
|
||||
# Transcribe the audio file
|
||||
./build/bin/whisper-cli --model models/ggml-base.en.bin --file jfk.opus
|
||||
```
|
||||
|
||||
## Docker
|
||||
|
||||
### Prerequisites
|
||||
@ -444,9 +388,8 @@ ffmpeg -i samples/jfk.wav jfk.opus
|
||||
|
||||
We have two Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggml-org/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggml-org/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
3. `ghcr.io/ggml-org/whisper.cpp:main-musa`: Same as `main` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
|
||||
### Usage
|
||||
|
||||
@ -459,11 +402,11 @@ docker run -it --rm \
|
||||
docker run -it --rm \
|
||||
-v path/to/models:/models \
|
||||
-v path/to/audios:/audios \
|
||||
whisper.cpp:main "whisper-cli -m /models/ggml-base.bin -f /audios/jfk.wav"
|
||||
whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
|
||||
# transcribe an audio file in samples folder
|
||||
docker run -it --rm \
|
||||
-v path/to/models:/models \
|
||||
whisper.cpp:main "whisper-cli -m /models/ggml-base.bin -f ./samples/jfk.wav"
|
||||
whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
|
||||
```
|
||||
|
||||
## Installing with Conan
|
||||
@ -484,8 +427,7 @@ For detailed instructions on how to use Conan, please refer to the [Conan docume
|
||||
|
||||
This is a naive example of performing real-time inference on audio from your microphone.
|
||||
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
|
||||
More info is available in [issue #10](https://github.com/ggml-org/whisper.cpp/issues/10).
|
||||
You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly.
|
||||
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_SDL2=ON
|
||||
@ -573,7 +515,7 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
|
||||
|
||||
## Speaker segmentation via tinydiarize (experimental)
|
||||
|
||||
More information about this approach is available here: https://github.com/ggml-org/whisper.cpp/pull/1058
|
||||
More information about this approach is available here: https://github.com/ggerganov/whisper.cpp/pull/1058
|
||||
|
||||
Sample usage:
|
||||
|
||||
@ -600,7 +542,7 @@ main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 pr
|
||||
## Karaoke-style movie generation (experimental)
|
||||
|
||||
The [whisper-cli](examples/cli) example provides support for output of karaoke-style movies, where the
|
||||
currently pronounced word is highlighted. Use the `-owts` argument and run the generated bash script.
|
||||
currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
|
||||
This requires to have `ffmpeg` installed.
|
||||
|
||||
Here are a few _"typical"_ examples:
|
||||
@ -637,7 +579,7 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
|
||||
|
||||
## Video comparison of different models
|
||||
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggml-org/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
|
||||
```bash
|
||||
./scripts/bench-wts.sh samples/jfk.wav
|
||||
@ -654,7 +596,7 @@ In order to have an objective comparison of the performance of the inference acr
|
||||
use the [whisper-bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
|
||||
took to execute it. The results are summarized in the following Github issue:
|
||||
|
||||
[Benchmark results](https://github.com/ggml-org/whisper.cpp/issues/89)
|
||||
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
|
||||
|
||||
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
|
||||
|
||||
@ -681,24 +623,25 @@ You can download the converted models using the [models/download-ggml-model.sh](
|
||||
or manually from here:
|
||||
|
||||
- https://huggingface.co/ggerganov/whisper.cpp
|
||||
- https://ggml.ggerganov.com
|
||||
|
||||
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
|
||||
|
||||
## [Bindings](https://github.com/ggml-org/whisper.cpp/discussions/categories/bindings)
|
||||
## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)
|
||||
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggml-org/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggml-org/whisper.cpp/discussions/309)
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
|
||||
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggml-org/whisper.cpp/discussions/312)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
- [x] Java:
|
||||
- [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggml-org/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggml-org/whisper.spm](https://github.com/ggml-org/whisper.spm) | [#313](https://github.com/ggml-org/whisper.cpp/discussions/313)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
|
||||
- [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
|
||||
- [x] .NET: | [#422](https://github.com/ggml-org/whisper.cpp/discussions/422)
|
||||
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
|
||||
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
|
||||
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
|
||||
- [x] Python: | [#9](https://github.com/ggml-org/whisper.cpp/issues/9)
|
||||
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
|
||||
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
|
||||
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
|
||||
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
|
||||
@ -706,116 +649,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
|
||||
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
|
||||
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
|
||||
|
||||
## XCFramework
|
||||
The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
|
||||
and macOS. It can be used in Swift projects without the need to compile the
|
||||
library from source. For examples:
|
||||
```swift
|
||||
// swift-tools-version: 5.10
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "Whisper",
|
||||
targets: [
|
||||
.executableTarget(
|
||||
name: "Whisper",
|
||||
dependencies: [
|
||||
"WhisperFramework"
|
||||
]),
|
||||
.binaryTarget(
|
||||
name: "WhisperFramework",
|
||||
url: "https://github.com/ggml-org/whisper.cpp/releases/download/v1.7.5/whisper-v1.7.5-xcframework.zip",
|
||||
checksum: "c7faeb328620d6012e130f3d705c51a6ea6c995605f2df50f6e1ad68c59c6c4a"
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Voice Activity Detection (VAD)
|
||||
Support for Voice Activity Detection (VAD) can be enabled using the `--vad`
|
||||
argument to `whisper-cli`. In addition to this option a VAD model is also
|
||||
required.
|
||||
|
||||
The way this works is that first the audio samples are passed through
|
||||
the VAD model which will detect speech segments. Using this information the
|
||||
only the speech segments that are detected are extracted from the original audio
|
||||
input and passed to whisper for processing. This reduces the amount of audio
|
||||
data that needs to be processed by whisper and can significantly speed up the
|
||||
transcription process.
|
||||
|
||||
The following VAD models are currently supported:
|
||||
|
||||
### Silero-VAD
|
||||
[Silero-vad](https://github.com/snakers4/silero-vad) is a lightweight VAD model
|
||||
written in Python that is fast and accurate.
|
||||
|
||||
Models can be downloaded by running the following command on Linux or MacOS:
|
||||
```console
|
||||
$ ./models/download-vad-model.sh silero-v5.1.2
|
||||
Downloading ggml model silero-v5.1.2 from 'https://huggingface.co/ggml-org/whisper-vad' ...
|
||||
ggml-silero-v5.1.2.bin 100%[==============================================>] 864.35K --.-KB/s in 0.04s
|
||||
Done! Model 'silero-v5.1.2' saved in '/path/models/ggml-silero-v5.1.2.bin'
|
||||
You can now use it like this:
|
||||
|
||||
$ ./build/bin/whisper-cli -vm /path/models/ggml-silero-v5.1.2.bin --vad -f samples/jfk.wav -m models/ggml-base.en.bin
|
||||
|
||||
```
|
||||
And the following command on Windows:
|
||||
```console
|
||||
> .\models\download-vad-model.cmd silero-v5.1.2
|
||||
Downloading vad model silero-v5.1.2...
|
||||
Done! Model silero-v5.1.2 saved in C:\Users\danie\work\ai\whisper.cpp\ggml-silero-v5.1.2.bin
|
||||
You can now use it like this:
|
||||
|
||||
C:\path\build\bin\Release\whisper-cli.exe -vm C:\path\ggml-silero-v5.1.2.bin --vad -m models/ggml-base.en.bin -f samples\jfk.wav
|
||||
|
||||
```
|
||||
|
||||
To see a list of all available models, run the above commands without any
|
||||
arguments.
|
||||
|
||||
This model can be also be converted manually to ggml using the following command:
|
||||
```console
|
||||
$ python3 -m venv venv && source venv/bin/activate
|
||||
$ (venv) pip install silero-vad
|
||||
$ (venv) $ python models/convert-silero-vad-to-ggml.py --output models/silero.bin
|
||||
Saving GGML Silero-VAD model to models/silero-v5.1.2-ggml.bin
|
||||
```
|
||||
And it can then be used with whisper as follows:
|
||||
```console
|
||||
$ ./build/bin/whisper-cli \
|
||||
--file ./samples/jfk.wav \
|
||||
--model ./models/ggml-base.en.bin \
|
||||
--vad \
|
||||
--vad-model ./models/silero-v5.1.2-ggml.bin
|
||||
```
|
||||
|
||||
### VAD Options
|
||||
|
||||
* --vad-threshold: Threshold probability for speech detection. A probability
|
||||
for a speech segment/frame above this threshold will be considered as speech.
|
||||
|
||||
* --vad-min-speech-duration-ms: Minimum speech duration in milliseconds. Speech
|
||||
segments shorter than this value will be discarded to filter out brief noise or
|
||||
false positives.
|
||||
|
||||
* --vad-min-silence-duration-ms: Minimum silence duration in milliseconds. Silence
|
||||
periods must be at least this long to end a speech segment. Shorter silence
|
||||
periods will be ignored and included as part of the speech.
|
||||
|
||||
* --vad-max-speech-duration-s: Maximum speech duration in seconds. Speech segments
|
||||
longer than this will be automatically split into multiple segments at silence
|
||||
points exceeding 98ms to prevent excessively long segments.
|
||||
|
||||
* --vad-speech-pad-ms: Speech padding in milliseconds. Adds this amount of padding
|
||||
before and after each detected speech segment to avoid cutting off speech edges.
|
||||
|
||||
* --vad-samples-overlap: Amount of audio to extend from each speech segment into
|
||||
the next one, in seconds (e.g., 0.10 = 100ms overlap). This ensures speech isn't
|
||||
cut off abruptly between segments when they're concatenated together.
|
||||
|
||||
## Examples
|
||||
|
||||
There are various examples of using the library for different projects in the [examples](examples) folder.
|
||||
@ -834,13 +667,13 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
|
||||
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
|
||||
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
|
||||
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggml-org/whisper.cpp/issues/185) |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
|
||||
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
|
||||
| [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
|
||||
|
||||
## [Discussions](https://github.com/ggml-org/whisper.cpp/discussions)
|
||||
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
|
||||
|
||||
If you have any kind of feedback about this project feel free to use the Discussions section and open a new topic.
|
||||
You can use the [Show and tell](https://github.com/ggml-org/whisper.cpp/discussions/categories/show-and-tell) category
|
||||
You can use the [Show and tell](https://github.com/ggerganov/whisper.cpp/discussions/categories/show-and-tell) category
|
||||
to share your own projects that use `whisper.cpp`. If you have a question, make sure to check the
|
||||
[Frequently asked questions (#126)](https://github.com/ggml-org/whisper.cpp/discussions/126) discussion.
|
||||
[Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126) discussion.
|
||||
|
498
README_sycl.md
498
README_sycl.md
@ -1,249 +1,249 @@
|
||||
# whisper.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The whisper.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
|
||||
|
||||
#for FP32
|
||||
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute whisper.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_whisper.sh
|
||||
```
|
||||
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|
||||
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
# whisper.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators<EFBFBD>such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel<EFBFBD> DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The whisper.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel<EFBFBD> oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel<EFBFBD> oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
|
||||
|
||||
#for FP32
|
||||
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute whisper.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_whisper.sh
|
||||
```
|
||||
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|
||||
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
5
Sources/whisper/module.modulemap
Normal file
5
Sources/whisper/module.modulemap
Normal file
@ -0,0 +1,5 @@
|
||||
module whisper [system] {
|
||||
header "whisper.h"
|
||||
link "whisper"
|
||||
export *
|
||||
}
|
4
Sources/whisper/whisper.h
Normal file
4
Sources/whisper/whisper.h
Normal file
@ -0,0 +1,4 @@
|
||||
#pragma once
|
||||
|
||||
#include <whisper.h>
|
||||
|
@ -11,11 +11,11 @@ UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
GGML_METAL_PATH_RESOURCES := $(abspath ../..)
|
||||
BUILD_DIR := build_go
|
||||
BUILD_DIR := build
|
||||
MODELS_DIR := models
|
||||
EXAMPLES_DIR := $(wildcard examples/*)
|
||||
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
|
||||
LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src:$(abspath ../../${BUILD_DIR}/ggml/src))
|
||||
LIBRARY_PATH := $(abspath ../..)
|
||||
|
||||
ifeq ($(GGML_CUDA),1)
|
||||
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
|
||||
@ -29,10 +29,8 @@ endif
|
||||
all: clean whisper examples
|
||||
|
||||
whisper: mkdir
|
||||
cmake -S ../.. -B ../../${BUILD_DIR} \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build ../../${BUILD_DIR} --target whisper
|
||||
@echo Build whisper
|
||||
@${MAKE} -C ../.. libwhisper.a
|
||||
|
||||
test: model-small whisper modtidy
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
|
@ -31,7 +31,7 @@ func main() {
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := context.Process(samples, nil, nil, nil); err != nil {
|
||||
if err := context.Process(samples, nil, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -51,7 +51,7 @@ func main() {
|
||||
In order to build, you need to have the Go compiler installed. You can get it from [here](https://golang.org/dl/). Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/go
|
||||
make test
|
||||
```
|
||||
@ -98,7 +98,7 @@ The API Documentation:
|
||||
|
||||
Getting help:
|
||||
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggml-org/whisper.cpp/discussions/312)
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
|
||||
## License
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
github.com/ggml-org/whisper.cpp/bindings/go
|
||||
github.com/ggerganov/whisper.cpp/bindings/go
|
||||
provides a speech-to-text service bindings for the Go programming language.
|
||||
*/
|
||||
package whisper
|
||||
|
@ -9,23 +9,22 @@ import (
|
||||
// ContextForSignal returns a context object which is cancelled when a signal
|
||||
// is received. It returns nil if no signal parameter is provided
|
||||
func ContextForSignal(signals ...os.Signal) context.Context {
|
||||
if len(signals) == 0 {
|
||||
return nil
|
||||
}
|
||||
if len(signals) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ch := make(chan os.Signal, 1) // Buffered channel with space for 1 signal
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
ch := make(chan os.Signal)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// Send message on channel when signal received
|
||||
signal.Notify(ch, signals...)
|
||||
// Send message on channel when signal received
|
||||
signal.Notify(ch, signals...)
|
||||
|
||||
// When any signal is received, call cancel
|
||||
go func() {
|
||||
<-ch
|
||||
cancel()
|
||||
}()
|
||||
// When any signal received, call cancel
|
||||
go func() {
|
||||
<-ch
|
||||
cancel()
|
||||
}()
|
||||
|
||||
// Return success
|
||||
return ctx
|
||||
// Return success
|
||||
return ctx
|
||||
}
|
||||
|
||||
|
@ -9,7 +9,6 @@ import (
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
@ -18,27 +17,14 @@ import (
|
||||
// CONSTANTS
|
||||
|
||||
const (
|
||||
srcUrl = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/" // The location of the models
|
||||
srcExt = ".bin" // Filename extension
|
||||
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
|
||||
srcUrl = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
|
||||
srcExt = ".bin" // Filename extension
|
||||
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
|
||||
)
|
||||
|
||||
var (
|
||||
// The models which will be downloaded, if no model is specified as an argument
|
||||
modelNames = []string{
|
||||
"tiny", "tiny-q5_1", "tiny-q8_0",
|
||||
"tiny.en", "tiny.en-q5_1", "tiny.en-q8_0",
|
||||
"base", "base-q5_1", "base-q8_0",
|
||||
"base.en", "base.en-q5_1", "base.en-q8_0",
|
||||
"small", "small-q5_1", "small-q8_0",
|
||||
"small.en", "small.en-q5_1", "small.en-q8_0",
|
||||
"medium", "medium-q5_0", "medium-q8_0",
|
||||
"medium.en", "medium.en-q5_0", "medium.en-q8_0",
|
||||
"large-v1",
|
||||
"large-v2", "large-v2-q5_0", "large-v2-q8_0",
|
||||
"large-v3", "large-v3-q5_0",
|
||||
"large-v3-turbo", "large-v3-turbo-q5_0", "large-v3-turbo-q8_0",
|
||||
}
|
||||
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
|
||||
)
|
||||
|
||||
var (
|
||||
@ -58,25 +44,7 @@ var (
|
||||
func main() {
|
||||
flag.Usage = func() {
|
||||
name := filepath.Base(flag.CommandLine.Name())
|
||||
fmt.Fprintf(flag.CommandLine.Output(), `
|
||||
Usage: %s [options] [<model>...]
|
||||
|
||||
Options:
|
||||
-out string Specify the output folder where models will be saved.
|
||||
Default: Current working directory.
|
||||
-timeout duration Set the maximum duration for downloading a model.
|
||||
Example: 10m, 1h (default: 30m0s).
|
||||
-quiet Suppress all output except errors.
|
||||
|
||||
Examples:
|
||||
1. Download a specific model:
|
||||
%s -out ./models tiny-q8_0
|
||||
|
||||
2. Download all models:
|
||||
%s -out ./models
|
||||
|
||||
`, name, name, name)
|
||||
|
||||
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [options] <model>\n\n", name)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
flag.Parse()
|
||||
@ -146,87 +114,23 @@ func GetOut() (string, error) {
|
||||
// GetModels returns the list of models to download
|
||||
func GetModels() []string {
|
||||
if flag.NArg() == 0 {
|
||||
fmt.Println("No model specified.")
|
||||
fmt.Println("Preparing to download all models...")
|
||||
|
||||
// Calculate total download size
|
||||
fmt.Println("Calculating total download size...")
|
||||
totalSize, err := CalculateTotalDownloadSize(modelNames)
|
||||
if err != nil {
|
||||
fmt.Println("Error calculating download sizes:", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Println("View available models: https://huggingface.co/ggerganov/whisper.cpp/tree/main")
|
||||
fmt.Printf("Total download size: %.2f GB\n", float64(totalSize)/(1024*1024*1024))
|
||||
fmt.Println("Would you like to download all models? (y/N)")
|
||||
|
||||
// Prompt for user input
|
||||
var response string
|
||||
fmt.Scanln(&response)
|
||||
if response != "y" && response != "Y" {
|
||||
fmt.Println("Aborting. Specify a model to download.")
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
return modelNames // Return all models if confirmed
|
||||
return modelNames
|
||||
} else {
|
||||
return flag.Args()
|
||||
}
|
||||
return flag.Args() // Return specific models if arguments are provided
|
||||
}
|
||||
|
||||
func CalculateTotalDownloadSize(models []string) (int64, error) {
|
||||
var totalSize int64
|
||||
client := http.Client{}
|
||||
|
||||
for _, model := range models {
|
||||
modelURL, err := URLForModel(model)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// Issue a HEAD request to get the file size
|
||||
req, err := http.NewRequest("HEAD", modelURL, nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
fmt.Printf("Warning: Unable to fetch size for %s (HTTP %d)\n", model, resp.StatusCode)
|
||||
continue
|
||||
}
|
||||
|
||||
size := resp.ContentLength
|
||||
totalSize += size
|
||||
}
|
||||
return totalSize, nil
|
||||
}
|
||||
|
||||
// URLForModel returns the URL for the given model on huggingface.co
|
||||
func URLForModel(model string) (string, error) {
|
||||
// Ensure "ggml-" prefix is added only once
|
||||
if !strings.HasPrefix(model, "ggml-") {
|
||||
model = "ggml-" + model
|
||||
}
|
||||
|
||||
// Ensure ".bin" extension is added only once
|
||||
if filepath.Ext(model) != srcExt {
|
||||
model += srcExt
|
||||
}
|
||||
|
||||
// Parse the base URL
|
||||
url, err := url.Parse(srcUrl)
|
||||
if err != nil {
|
||||
return "", err
|
||||
} else {
|
||||
url.Path = filepath.Join(url.Path, model)
|
||||
}
|
||||
|
||||
// Ensure no trailing slash in the base URL
|
||||
url.Path = fmt.Sprintf("%s/%s", strings.TrimSuffix(url.Path, "/"), model)
|
||||
return url.String(), nil
|
||||
}
|
||||
|
||||
|
@ -67,7 +67,7 @@ func Process(model whisper.Model, path string, flags *Flags) error {
|
||||
// Process the data
|
||||
fmt.Fprintf(flags.Output(), " ...processing %q\n", path)
|
||||
context.ResetTimings()
|
||||
if err := context.Process(data, nil, cb, nil); err != nil {
|
||||
if err := context.Process(data, cb, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -71,10 +71,6 @@ func (context *context) Language() string {
|
||||
return whisper.Whisper_lang_str(context.params.Language())
|
||||
}
|
||||
|
||||
func (context *context) DetectedLanguage() string {
|
||||
return whisper.Whisper_lang_str(context.model.ctx.Whisper_full_lang_id())
|
||||
}
|
||||
|
||||
// Set translate flag
|
||||
func (context *context) SetTranslate(v bool) {
|
||||
context.params.SetTranslate(v)
|
||||
@ -193,7 +189,6 @@ func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]f
|
||||
// Process new sample data and return any errors
|
||||
func (context *context) Process(
|
||||
data []float32,
|
||||
callEncoderBegin EncoderBeginCallback,
|
||||
callNewSegment SegmentCallback,
|
||||
callProgress ProgressCallback,
|
||||
) error {
|
||||
@ -208,20 +203,7 @@ func (context *context) Process(
|
||||
// We don't do parallel processing at the moment
|
||||
processors := 0
|
||||
if processors > 1 {
|
||||
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, callEncoderBegin,
|
||||
func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := context.model.ctx.Whisper_full(context.params, data, callEncoderBegin,
|
||||
func(new int) {
|
||||
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
@ -229,11 +211,22 @@ func (context *context) Process(
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}, func(progress int) {
|
||||
if callProgress != nil {
|
||||
callProgress(progress)
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}, func(progress int) {
|
||||
if callProgress != nil {
|
||||
callProgress(progress)
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -88,37 +88,6 @@ func TestProcess(t *testing.T) {
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
err = context.Process(data, nil, nil, nil)
|
||||
err = context.Process(data, nil, nil)
|
||||
assert.NoError(err)
|
||||
}
|
||||
|
||||
func TestDetectedLanguage(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Decode the WAV file - load the full buffer
|
||||
dec := wav.NewDecoder(fh)
|
||||
buf, err := dec.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
assert.Equal(uint16(1), dec.NumChans)
|
||||
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
err = context.Process(data, nil, nil, nil)
|
||||
assert.NoError(err)
|
||||
|
||||
expectedLanguage := "en"
|
||||
actualLanguage := context.DetectedLanguage()
|
||||
assert.Equal(expectedLanguage, actualLanguage)
|
||||
}
|
||||
|
@ -16,10 +16,6 @@ type SegmentCallback func(Segment)
|
||||
// processing. It is called during the Process function
|
||||
type ProgressCallback func(int)
|
||||
|
||||
// EncoderBeginCallback is the callback function for checking if we want to
|
||||
// continue processing. It is called during the Process function
|
||||
type EncoderBeginCallback func() bool
|
||||
|
||||
// Model is the interface to a whisper model. Create a new model with the
|
||||
// function whisper.New(string)
|
||||
type Model interface {
|
||||
@ -35,13 +31,12 @@ type Model interface {
|
||||
Languages() []string
|
||||
}
|
||||
|
||||
// Context is the speech recognition context.
|
||||
// Context is the speach recognition context.
|
||||
type Context interface {
|
||||
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
|
||||
SetTranslate(bool) // Set translate flag
|
||||
IsMultilingual() bool // Return true if the model is multilingual.
|
||||
Language() string // Get language
|
||||
DetectedLanguage() string // Get detected language
|
||||
|
||||
SetOffset(time.Duration) // Set offset
|
||||
SetDuration(time.Duration) // Set duration
|
||||
@ -63,7 +58,7 @@ type Context interface {
|
||||
// Process mono audio data and return any errors.
|
||||
// If defined, newly generated segments are passed to the
|
||||
// callback function during processing.
|
||||
Process([]float32, EncoderBeginCallback, SegmentCallback, ProgressCallback) error
|
||||
Process([]float32, SegmentCallback, ProgressCallback) error
|
||||
|
||||
// After process is called, return segments until the end of the stream
|
||||
// is reached, when io.EOF is returned.
|
||||
|
@ -9,7 +9,7 @@ import (
|
||||
// CGO
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ -fopenmp
|
||||
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
|
||||
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
|
||||
#include <whisper.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -31,10 +31,10 @@ public class Example {
|
||||
var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
// custom configuration if required
|
||||
whisperParams.temperature_inc = 0f;
|
||||
|
||||
|
||||
var samples = readAudio(); // divide each value by 32767.0f
|
||||
whisper.fullTranscribe(whisperParams, samples);
|
||||
|
||||
|
||||
int segmentCount = whisper.getTextSegmentCount(context);
|
||||
for (int i = 0; i < segmentCount; i++) {
|
||||
String text = whisper.getTextSegment(context, i);
|
||||
@ -52,7 +52,7 @@ public class Example {
|
||||
In order to build, you need to have the JDK 8 or higher installed. Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/java
|
||||
|
||||
./gradlew build
|
||||
|
@ -25,43 +25,25 @@ sourceSets {
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperDynlib', Copy) {
|
||||
from '../../build/src'
|
||||
include 'libwhisper.dylib'
|
||||
into 'build/generated/resources/main'
|
||||
from '../../build'
|
||||
include 'libwhisper.dynlib'
|
||||
into 'build/generated/resources/main/darwin'
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperSo', Copy) {
|
||||
from '../../build/src'
|
||||
from '../../build'
|
||||
include 'libwhisper.so'
|
||||
into 'build/generated/resources/main'
|
||||
into 'build/generated/resources/main/linux-x86-64'
|
||||
}
|
||||
|
||||
tasks.register('copyWhisperDLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
tasks.register('copyWhisperDll', Copy) {
|
||||
from '../../build/Release'
|
||||
include 'whisper.dll'
|
||||
into 'build/generated/resources/main'
|
||||
}
|
||||
|
||||
tasks.register('copyGGML_BASE_DLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
include 'ggml-base.dll'
|
||||
into 'build/generated/resources/main'
|
||||
}
|
||||
|
||||
tasks.register('copyGGML_DLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
include 'ggml.dll'
|
||||
into 'build/generated/resources/main'
|
||||
}
|
||||
|
||||
tasks.register('copyGGML_CPU_DLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
include 'ggml-cpu.dll'
|
||||
into 'build/generated/resources/main'
|
||||
into 'build/generated/resources/main/windows-x86-64'
|
||||
}
|
||||
|
||||
tasks.register('copyLibs') {
|
||||
dependsOn copyLibwhisperDynlib, copyLibwhisperSo, copyWhisperDLL, copyGGML_BASE_DLL, copyGGML_DLL, copyGGML_CPU_DLL
|
||||
dependsOn copyLibwhisperDynlib, copyLibwhisperSo, copyWhisperDll
|
||||
}
|
||||
|
||||
test {
|
||||
@ -73,12 +55,7 @@ java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
sourcesJar() {
|
||||
dependsOn copyLibs
|
||||
}
|
||||
|
||||
jar {
|
||||
dependsOn copyLibs
|
||||
exclude '**/whisper_java.exp', '**/whisper_java.lib'
|
||||
}
|
||||
|
||||
@ -90,9 +67,6 @@ tasks.withType(Test) {
|
||||
useJUnitPlatform()
|
||||
}
|
||||
|
||||
test.dependsOn copyLibs
|
||||
processResources.dependsOn copyLibs
|
||||
|
||||
dependencies {
|
||||
implementation "net.java.dev.jna:jna:5.13.0"
|
||||
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
|
||||
|
0
bindings/java/gradlew
vendored
Executable file → Normal file
0
bindings/java/gradlew
vendored
Executable file → Normal file
@ -1,24 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
/**
|
||||
* Presets for alignment heads in DTW token timestamps
|
||||
*/
|
||||
public class WhisperConstants {
|
||||
// Alignment heads presets
|
||||
public static final int WHISPER_AHEADS_NONE = 0;
|
||||
public static final int WHISPER_AHEADS_TINY_EN = 1;
|
||||
public static final int WHISPER_AHEADS_TINY = 2;
|
||||
public static final int WHISPER_AHEADS_BASE_EN = 3;
|
||||
public static final int WHISPER_AHEADS_BASE = 4;
|
||||
public static final int WHISPER_AHEADS_SMALL_EN = 5;
|
||||
public static final int WHISPER_AHEADS_SMALL = 6;
|
||||
public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
|
||||
public static final int WHISPER_AHEADS_MEDIUM = 8;
|
||||
public static final int WHISPER_AHEADS_LARGE_V1 = 9;
|
||||
public static final int WHISPER_AHEADS_LARGE_V2 = 10;
|
||||
public static final int WHISPER_AHEADS_LARGE_V3 = 11;
|
||||
public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
|
||||
public static final int WHISPER_AHEADS_CUSTOM = 13;
|
||||
public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
|
||||
public static final int WHISPER_AHEADS_COUNT = 15;
|
||||
}
|
@ -1,9 +1,7 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import com.sun.jna.NativeLong;
|
||||
import com.sun.jna.Structure;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.ggml.GgmlType;
|
||||
import io.github.ggerganov.whispercpp.WhisperModel;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
@ -11,26 +9,33 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperContext extends Structure {
|
||||
public NativeLong t_load_us;
|
||||
public NativeLong t_start_us;
|
||||
int t_load_us = 0;
|
||||
int t_start_us = 0;
|
||||
|
||||
/** weight type (FP32 / FP16 / QX) */
|
||||
public GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
||||
GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
||||
/** intermediate type (FP32 or FP16) */
|
||||
public GgmlType itype = GgmlType.GGML_TYPE_F16;
|
||||
GgmlType itype = GgmlType.GGML_TYPE_F16;
|
||||
|
||||
public WhisperContextParams.ByValue params;
|
||||
|
||||
public Pointer model;
|
||||
public Pointer vocab;
|
||||
public Pointer state;
|
||||
// WhisperModel model;
|
||||
public PointerByReference model;
|
||||
// whisper_vocab vocab;
|
||||
// whisper_state * state = nullptr;
|
||||
public PointerByReference vocab;
|
||||
public PointerByReference state;
|
||||
|
||||
/** populated by whisper_init_from_file_with_params() */
|
||||
public Pointer path_model;
|
||||
String path_model;
|
||||
WhisperContextParams params;
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return List.of("t_load_us", "t_start_us", "wtype", "itype",
|
||||
"params", "model", "vocab", "state", "path_model");
|
||||
}
|
||||
// public static class ByReference extends WhisperContext implements Structure.ByReference {
|
||||
// }
|
||||
//
|
||||
// public static class ByValue extends WhisperContext implements Structure.ByValue {
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// protected List<String> getFieldOrder() {
|
||||
// return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
|
||||
// }
|
||||
}
|
||||
|
@ -43,11 +43,11 @@ public class WhisperCpp implements AutoCloseable {
|
||||
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
|
||||
* @param params - params to use when initialising the context
|
||||
*/
|
||||
public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
|
||||
public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
||||
initContextImpl(modelPath, params);
|
||||
}
|
||||
|
||||
private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
|
||||
private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
||||
if (ctx != null) {
|
||||
lib.whisper_free(ctx);
|
||||
}
|
||||
@ -69,13 +69,15 @@ public class WhisperCpp implements AutoCloseable {
|
||||
|
||||
/**
|
||||
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
||||
* Returns a ByValue instance to ensure proper parameter passing to native code.
|
||||
* Because this function allocates memory for the params, the caller must call either:
|
||||
* - call `whisper_free_context_params()`
|
||||
* - `Native.free(Pointer.nativeValue(pointer));`
|
||||
*/
|
||||
public WhisperContextParams.ByValue getContextDefaultParams() {
|
||||
WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
|
||||
lib.whisper_context_default_params_by_ref());
|
||||
valueParams.read();
|
||||
return valueParams;
|
||||
public WhisperContextParams getContextDefaultParams() {
|
||||
paramsPointer = lib.whisper_context_default_params_by_ref();
|
||||
WhisperContextParams params = new WhisperContextParams(paramsPointer);
|
||||
params.read();
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -86,7 +88,7 @@ public class WhisperCpp implements AutoCloseable {
|
||||
*
|
||||
* @param strategy - GREEDY
|
||||
*/
|
||||
public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
||||
public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
||||
Pointer pointer;
|
||||
|
||||
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
|
||||
@ -102,7 +104,7 @@ public class WhisperCpp implements AutoCloseable {
|
||||
pointer = beamParamsPointer;
|
||||
}
|
||||
|
||||
WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
|
||||
WhisperFullParams params = new WhisperFullParams(pointer);
|
||||
params.read();
|
||||
return params;
|
||||
}
|
||||
@ -136,21 +138,15 @@ public class WhisperCpp implements AutoCloseable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
|
||||
* Not thread safe for same context
|
||||
* Uses the specified decoding strategy to obtain the text.
|
||||
*/
|
||||
public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
|
||||
public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
|
||||
/*
|
||||
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
|
||||
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
|
||||
valueParams.read();
|
||||
*/
|
||||
|
||||
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
||||
throw new IOException("Failed to process audio");
|
||||
}
|
||||
@ -167,16 +163,7 @@ public class WhisperCpp implements AutoCloseable {
|
||||
|
||||
return str.toString().trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Full transcribe with time list.
|
||||
*
|
||||
* @param whisperParams the whisper params
|
||||
* @param audioData the audio data
|
||||
* @return the list
|
||||
* @throws IOException the io exception
|
||||
*/
|
||||
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
|
||||
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
@ -188,6 +175,7 @@ public class WhisperCpp implements AutoCloseable {
|
||||
int nSegments = lib.whisper_full_n_segments(ctx);
|
||||
List<WhisperSegment> segments= new ArrayList<>(nSegments);
|
||||
|
||||
|
||||
for (int i = 0; i < nSegments; i++) {
|
||||
long t0 = lib.whisper_full_get_segment_t0(ctx, i);
|
||||
String text = lib.whisper_full_get_segment_text(ctx, i);
|
||||
|
@ -9,7 +9,6 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
|
||||
|
||||
public interface WhisperCppJnaLibrary extends Library {
|
||||
|
||||
WhisperCppJnaLibrary instance = Native.load("whisper", WhisperCppJnaLibrary.class);
|
||||
|
||||
String whisper_print_system_info();
|
||||
@ -39,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
* @param params Pointer to whisper_context_params
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);
|
||||
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model by loading from a buffer.
|
||||
@ -181,12 +180,12 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
/**
|
||||
* @return the id of the specified language, returns -1 if not found.
|
||||
* Examples:
|
||||
* "de" -> 2
|
||||
* "german" -> 2
|
||||
* "de" -> 2
|
||||
* "german" -> 2
|
||||
*/
|
||||
int whisper_lang_id(String lang);
|
||||
|
||||
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
|
||||
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
|
||||
String whisper_lang_str(int id);
|
||||
|
||||
/**
|
||||
@ -269,21 +268,20 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
void whisper_free_params(Pointer params);
|
||||
|
||||
/**
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
* Not thread safe for same context
|
||||
* Uses the specified decoding strategy to obtain the text.
|
||||
*/
|
||||
int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);
|
||||
int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
|
||||
public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
|
||||
//int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
||||
// Result is stored in the default state of the context
|
||||
// Not thread safe if executed in parallel on the same context.
|
||||
// It seems this approach can offer some speedup in some cases.
|
||||
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);
|
||||
int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
|
||||
|
||||
/**
|
||||
* Number of generated text segments.
|
||||
|
@ -1,17 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.callbacks;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
|
||||
/**
|
||||
* Callback for aborting GGML computation
|
||||
* Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
|
||||
*/
|
||||
public interface GgmlAbortCallback extends Callback {
|
||||
/**
|
||||
* Return true to abort the computation, false to continue
|
||||
*
|
||||
* @param data User data passed to the callback
|
||||
* @return true to abort, false to continue
|
||||
*/
|
||||
boolean invoke(com.sun.jna.Pointer data);
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
import com.sun.jna.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperAhead extends Structure {
|
||||
|
||||
public int n_text_layer;
|
||||
|
||||
public int n_head;
|
||||
|
||||
public WhisperAhead() {
|
||||
super();
|
||||
}
|
||||
|
||||
public WhisperAhead(int textLayer, int head) {
|
||||
super();
|
||||
this.n_text_layer = textLayer;
|
||||
this.n_head = head;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("n_text_layer", "n_head");
|
||||
}
|
||||
|
||||
public static class ByReference extends WhisperAhead implements Structure.ByReference {}
|
||||
|
||||
public static class ByValue extends WhisperAhead implements Structure.ByValue {}
|
||||
}
|
@ -1,41 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
import com.sun.jna.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperAheads extends Structure {
|
||||
public NativeLong n_heads;
|
||||
|
||||
public Pointer heads;
|
||||
|
||||
public WhisperAheads() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create alignment heads from an array of WhisperAhead objects
|
||||
*/
|
||||
public void setHeads(WhisperAhead[] aheadsArray) {
|
||||
this.n_heads = new NativeLong(aheadsArray.length);
|
||||
|
||||
int structSize = aheadsArray[0].size();
|
||||
Memory mem = new Memory(structSize * aheadsArray.length);
|
||||
|
||||
for (int i = 0; i < aheadsArray.length; i++) {
|
||||
aheadsArray[i].write();
|
||||
byte[] buffer = aheadsArray[i].getPointer().getByteArray(0, structSize);
|
||||
mem.write(i * structSize, buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
this.heads = mem;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("n_heads", "heads");
|
||||
}
|
||||
|
||||
public static class ByReference extends WhisperAheads implements Structure.ByReference {}
|
||||
|
||||
public static class ByValue extends WhisperAheads implements Structure.ByValue {}
|
||||
}
|
@ -1,5 +1,7 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.*;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
@ -9,73 +11,21 @@ import java.util.List;
|
||||
* whisper_context_default_params()
|
||||
*/
|
||||
public class WhisperContextParams extends Structure {
|
||||
|
||||
public WhisperContextParams(Pointer p) {
|
||||
super(p);
|
||||
}
|
||||
|
||||
public WhisperContextParams() {
|
||||
super();
|
||||
}
|
||||
|
||||
/** Use GPU for inference (default = true) */
|
||||
/** Use GPU for inference Number (default = true) */
|
||||
public CBool use_gpu;
|
||||
|
||||
/** Use flash attention (default = false) */
|
||||
public CBool flash_attn;
|
||||
|
||||
/** CUDA device to use (default = 0) */
|
||||
public int gpu_device;
|
||||
|
||||
/** [EXPERIMENTAL] Enable token-level timestamps with DTW (default = false) */
|
||||
public CBool dtw_token_timestamps;
|
||||
|
||||
/** [EXPERIMENTAL] Alignment heads preset for DTW */
|
||||
public int dtw_aheads_preset;
|
||||
|
||||
/** Number of top layers to use for DTW when using WHISPER_AHEADS_N_TOP_MOST preset */
|
||||
public int dtw_n_top;
|
||||
|
||||
public WhisperAheads.ByValue dtw_aheads;
|
||||
|
||||
/** DTW memory size (internal use) */
|
||||
public NativeLong dtw_mem_size;
|
||||
|
||||
/** Use GPU for inference */
|
||||
/** Use GPU for inference Number (default = true) */
|
||||
public void useGpu(boolean enable) {
|
||||
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Use flash attention */
|
||||
public void useFlashAttn(boolean enable) {
|
||||
flash_attn = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Enable DTW token-level timestamps */
|
||||
public void enableDtwTokenTimestamps(boolean enable) {
|
||||
dtw_token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Set DTW alignment heads preset */
|
||||
public void setDtwAheadsPreset(int preset) {
|
||||
dtw_aheads_preset = preset;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList(
|
||||
"use_gpu",
|
||||
"flash_attn",
|
||||
"gpu_device",
|
||||
"dtw_token_timestamps",
|
||||
"dtw_aheads_preset",
|
||||
"dtw_n_top",
|
||||
"dtw_aheads",
|
||||
"dtw_mem_size"
|
||||
);
|
||||
}
|
||||
|
||||
public static class ByValue extends WhisperContextParams implements Structure.ByValue {
|
||||
public ByValue() { super(); }
|
||||
public ByValue(Pointer p) { super(p); }
|
||||
return Arrays.asList("use_gpu");
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,6 @@ import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.GgmlAbortCallback;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
@ -17,12 +16,10 @@ import java.util.List;
|
||||
*/
|
||||
public class WhisperFullParams extends Structure {
|
||||
|
||||
public WhisperFullParams() {
|
||||
super();
|
||||
}
|
||||
|
||||
public WhisperFullParams(Pointer p) {
|
||||
super(p);
|
||||
// super(p, ALIGN_MSVC);
|
||||
// super(p, ALIGN_GNUC);
|
||||
}
|
||||
|
||||
/** Sampling strategy for whisper_full() function. */
|
||||
@ -72,10 +69,10 @@ public class WhisperFullParams extends Structure {
|
||||
single_segment = single ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
public CBool print_special;
|
||||
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
public void printSpecial(boolean enable) {
|
||||
print_special = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
@ -132,14 +129,6 @@ public class WhisperFullParams extends Structure {
|
||||
/** Maximum tokens per segment (0, default = no limit) */
|
||||
public int max_tokens;
|
||||
|
||||
/** [EXPERIMENTAL] Enable debug mode for extra info */
|
||||
public CBool debug_mode;
|
||||
|
||||
/** Enable debug mode */
|
||||
public void enableDebugMode(boolean enable) {
|
||||
debug_mode = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Overwrite the audio context size (0 = use default). */
|
||||
public int audio_ctx;
|
||||
|
||||
@ -285,16 +274,6 @@ public class WhisperFullParams extends Structure {
|
||||
*/
|
||||
public Pointer encoder_begin_callback_user_data;
|
||||
|
||||
/** Callback used to abort GGML computation */
|
||||
public Pointer abort_callback;
|
||||
|
||||
/** User data for the abort_callback */
|
||||
public Pointer abort_callback_user_data;
|
||||
|
||||
public void setAbortCallback(GgmlAbortCallback callback) {
|
||||
abort_callback = CallbackReference.getFunctionPointer(callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback by each decoder to filter obtained logits.
|
||||
* WhisperLogitsFilterCallback
|
||||
@ -331,28 +310,17 @@ public class WhisperFullParams extends Structure {
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
|
||||
"offset_ms", "duration_ms", "translate", "no_context",
|
||||
"no_timestamps", "single_segment", "print_special",
|
||||
"print_progress", "print_realtime", "print_timestamps",
|
||||
"token_timestamps", "thold_pt", "thold_ptsum", "max_len",
|
||||
"split_on_word", "max_tokens", "debug_mode", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt",
|
||||
"prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_nst", "temperature",
|
||||
"max_initial_ts", "length_penalty", "temperature_inc",
|
||||
"entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
|
||||
"beam_search", "new_segment_callback", "new_segment_callback_user_data",
|
||||
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
|
||||
"no_context", "single_segment", "no_timestamps",
|
||||
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
||||
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty",
|
||||
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
||||
"new_segment_callback", "new_segment_callback_user_data",
|
||||
"progress_callback", "progress_callback_user_data",
|
||||
"encoder_begin_callback", "encoder_begin_callback_user_data",
|
||||
"abort_callback", "abort_callback_user_data",
|
||||
"logits_filter_callback", "logits_filter_callback_user_data",
|
||||
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
|
||||
}
|
||||
|
||||
public static class ByValue extends WhisperFullParams implements Structure.ByValue {
|
||||
public ByValue() { super(); }
|
||||
public ByValue(Pointer p) { super(p); }
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ class WhisperCppTest {
|
||||
float[] floats = new float[b.length / 2];
|
||||
|
||||
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
||||
params.print_progress = CBool.FALSE;
|
||||
//params.initial_prompt = "and so my fellow Americans um, like";
|
||||
@ -118,7 +118,7 @@ class WhisperCppTest {
|
||||
float[] floats = new float[b.length / 2];
|
||||
|
||||
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
||||
params.print_progress = CBool.FALSE;
|
||||
//params.initial_prompt = "and so my fellow Americans um, like";
|
||||
|
@ -33,9 +33,6 @@ mkdir build-em && cd build-em
|
||||
emcmake cmake .. && make -j
|
||||
|
||||
# run test
|
||||
node ../tests/test-whisper.js
|
||||
|
||||
# For Node.js versions prior to v16.4.0, experimental features need to be enabled:
|
||||
node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
|
||||
|
||||
# publish npm package
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.7.5",
|
||||
"version": "1.7.4",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
6
bindings/ruby/.gitignore
vendored
6
bindings/ruby/.gitignore
vendored
@ -1,9 +1,3 @@
|
||||
LICENSE
|
||||
pkg/
|
||||
lib/whisper.*
|
||||
ext/examples/
|
||||
ext/ggml/
|
||||
ext/include/
|
||||
ext/scripts/
|
||||
ext/src/
|
||||
test/fixtures/
|
||||
|
@ -16,32 +16,6 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
||||
|
||||
$ gem install whispercpp
|
||||
|
||||
You can pass build options for whisper.cpp, for instance:
|
||||
|
||||
$ bundle config build.whispercpp --enable-ggml-cuda
|
||||
|
||||
or,
|
||||
|
||||
$ gem install whispercpp -- --enable-ggml-cuda
|
||||
|
||||
See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options, for example:
|
||||
|
||||
Boolean options:
|
||||
|
||||
* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
|
||||
* `-DWHISER_COREML=OFF` -> `--disable-whisper-coreml`
|
||||
|
||||
Argument options:
|
||||
|
||||
* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
|
||||
|
||||
Combination:
|
||||
|
||||
* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake_cuda-architectures="86"`
|
||||
|
||||
For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and make it kebab-case: `--enable-ggml-cuda`.
|
||||
For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need strip `-D`, prepend `--`, make it kebab-case, append `=` and append argument: `--cmake-cuda-architectures="86"`.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
@ -70,6 +44,17 @@ end
|
||||
|
||||
Some models are prepared up-front:
|
||||
|
||||
```ruby
|
||||
base_en = Whisper::Model.pre_converted_models["base.en"]
|
||||
whisper = Whisper::Context.new(base_en)
|
||||
```
|
||||
|
||||
At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
|
||||
|
||||
```ruby
|
||||
Whisper::Model.pre_converted_models["base"].clear_cache
|
||||
```
|
||||
|
||||
You also can use shorthand for pre-converted models:
|
||||
|
||||
```ruby
|
||||
@ -94,19 +79,6 @@ puts Whisper::Model.pre_converted_models.keys
|
||||
# :
|
||||
```
|
||||
|
||||
You can also retrieve each model:
|
||||
|
||||
```ruby
|
||||
base_en = Whisper::Model.pre_converted_models["base.en"]
|
||||
whisper = Whisper::Context.new(base_en)
|
||||
```
|
||||
|
||||
At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
|
||||
|
||||
```ruby
|
||||
Whisper::Model.pre_converted_models["base"].clear_cache
|
||||
```
|
||||
|
||||
You can also use local model files you prepared:
|
||||
|
||||
```ruby
|
||||
@ -127,80 +99,9 @@ See [models][] page for details.
|
||||
|
||||
Currently, whisper.cpp accepts only 16-bit WAV files.
|
||||
|
||||
### Voice Activity Detection (VAD) ###
|
||||
|
||||
Support for Voice Activity Detection (VAD) can be enabled by setting `Whisper::Params`'s `vad` argument to `true` and specifying VAD model:
|
||||
|
||||
```ruby
|
||||
Whisper::Params.new(
|
||||
vad: true,
|
||||
vad_model_path: "silero-v5.1.2",
|
||||
# other arguments...
|
||||
)
|
||||
```
|
||||
|
||||
When you pass the model name (`"silero-v5.1.2"`) or URI (`https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin`), it will be downloaded automatically.
|
||||
Currently, "silero-v5.1.2" is registered as pre-converted model like ASR models. You also specify file path or URI of model.
|
||||
|
||||
If you need configure VAD behavior, pass params for that:
|
||||
|
||||
```ruby
|
||||
Whisper::Params.new(
|
||||
vad: true,
|
||||
vad_model_path: "silero-v5.1.2",
|
||||
vad_params: Whisper::VAD::Params.new(
|
||||
threshold: 1.0, # defaults to 0.5
|
||||
min_speech_duration_ms: 500, # defaults to 250
|
||||
min_silence_duration_ms: 200, # defaults to 100
|
||||
max_speech_duration_s: 30000, # default is FLT_MAX,
|
||||
speech_pad_ms: 50, # defaults to 30
|
||||
samples_overlap: 0.5 # defaults to 0.1
|
||||
),
|
||||
# other arguments...
|
||||
)
|
||||
```
|
||||
|
||||
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
|
||||
|
||||
### Output ###
|
||||
|
||||
whispercpp supports SRT and WebVTT output:
|
||||
|
||||
```ruby
|
||||
puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
|
||||
# =>
|
||||
WEBVTT
|
||||
|
||||
1
|
||||
00:00:00.000 --> 00:00:03.860
|
||||
My thought I have nobody by a beauty and will as you poured.
|
||||
|
||||
2
|
||||
00:00:03.860 --> 00:00:09.840
|
||||
Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
|
||||
|
||||
3
|
||||
00:00:09.840 --> 00:00:09.940
|
||||
a
|
||||
|
||||
```
|
||||
|
||||
You may call `#to_srt`, too
|
||||
|
||||
|
||||
API
|
||||
---
|
||||
|
||||
### Transcription ###
|
||||
|
||||
By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing `n_processors` option:
|
||||
|
||||
```ruby
|
||||
whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
|
||||
```
|
||||
|
||||
Note that transcription occasionally might be low accuracy when it works in parallel.
|
||||
|
||||
### Segments ###
|
||||
|
||||
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
|
||||
@ -222,7 +123,7 @@ whisper
|
||||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_turn_next?
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
puts line
|
||||
end
|
||||
|
||||
@ -238,7 +139,7 @@ params.on_new_segment do |segment|
|
||||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_turn_next?
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
puts line
|
||||
end
|
||||
|
||||
@ -327,7 +228,7 @@ The second argument `samples` may be an array, an object with `length` and `each
|
||||
Development
|
||||
-----------
|
||||
|
||||
% git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
% git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
% cd whisper.cpp/bindings/ruby
|
||||
% rake test
|
||||
|
||||
@ -335,15 +236,10 @@ First call of `rake test` builds an extension and downloads a model for testing.
|
||||
|
||||
If something seems wrong on build, running `rake clean` solves some cases.
|
||||
|
||||
### Need help ###
|
||||
|
||||
* Windows support
|
||||
* Refinement of C/C++ code, especially memory management
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
The same to [whisper.cpp][].
|
||||
|
||||
[whisper.cpp]: https://github.com/ggml-org/whisper.cpp
|
||||
[models]: https://github.com/ggml-org/whisper.cpp/tree/master/models
|
||||
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
|
||||
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models
|
||||
|
@ -3,15 +3,11 @@ require "bundler/gem_tasks"
|
||||
require "rake/testtask"
|
||||
require_relative "extsources"
|
||||
|
||||
SOURCES_DIR = "ext/sources"
|
||||
|
||||
SOURCES = FileList[]
|
||||
|
||||
EXTSOURCES.each do |src|
|
||||
basename = src.pathmap("%f")
|
||||
dest = basename == "LICENSE" ? basename
|
||||
: src.pathmap("%{\\.\\./\\.\\.,#{SOURCES_DIR}}p")
|
||||
.pathmap("%{\\.\\./javascript,#{SOURCES_DIR}/bindings/javascript}p")
|
||||
dest = basename == "LICENSE" ? basename : src.pathmap("%{../..,ext}p")
|
||||
dir = dest.pathmap("%d")
|
||||
file src
|
||||
directory dir
|
||||
@ -22,6 +18,7 @@ EXTSOURCES.each do |src|
|
||||
end
|
||||
|
||||
CLEAN.include SOURCES
|
||||
CLEAN.include FileList["ext/**/*.o", "ext/**/*.metal", "ext/**/*.tmp", "ext/whisper.{so,bundle,dll}"]
|
||||
|
||||
SRC = FileList["ext/*.{c,cpp,h}"]
|
||||
|
||||
@ -39,20 +36,6 @@ file "ext/Makefile" => SRC + ["ext/extconf.rb"] + SOURCES do |t|
|
||||
ruby "extconf.rb"
|
||||
end
|
||||
end
|
||||
if File.exist? "ext/Makefile"
|
||||
task :make_clean do
|
||||
cd "ext" do
|
||||
sh "make", "clean"
|
||||
end
|
||||
end
|
||||
task clean: :make_clean
|
||||
task :make_distclean do
|
||||
cd "ext" do
|
||||
sh "make", "distclean"
|
||||
end
|
||||
end
|
||||
task clobber: :make_distclean
|
||||
end
|
||||
|
||||
file SO_FILE => "ext/Makefile" do |t|
|
||||
chdir "ext" do
|
||||
@ -67,30 +50,17 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
|
||||
end
|
||||
CLEAN.include LIB_FILE
|
||||
|
||||
Rake::TestTask.new
|
||||
|
||||
TEST_FIXTURE_AUDIO = "test/fixtures/jfk.wav"
|
||||
TEST_FIXTURE_AUDIO_SRC = File.expand_path(File.join(__dir__, "..", "..", "samples", "jfk.wav"))
|
||||
TEST_FIXTURE_AUDIO_DIR = TEST_FIXTURE_AUDIO.pathmap("%d")
|
||||
directory TEST_FIXTURE_AUDIO_DIR
|
||||
if File.exist? TEST_FIXTURE_AUDIO_SRC
|
||||
file TEST_FIXTURE_AUDIO => [TEST_FIXTURE_AUDIO_SRC, TEST_FIXTURE_AUDIO_DIR] do |t|
|
||||
symlink t.source, t.name
|
||||
end
|
||||
else
|
||||
require "open-uri"
|
||||
file TEST_FIXTURE_AUDIO => TEST_FIXTURE_AUDIO_DIR do |t|
|
||||
File.write t.name, URI("https://github.com/ggml-org/whisper.cpp/raw/refs/heads/master/samples/jfk.wav").read
|
||||
end
|
||||
Rake::TestTask.new do |t|
|
||||
t.test_files = FileList["tests/test_*.rb"]
|
||||
end
|
||||
|
||||
TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
|
||||
file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
|
||||
chdir "test/jfk_reader" do
|
||||
TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
|
||||
file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
|
||||
chdir "tests/jfk_reader" do
|
||||
ruby "extconf.rb"
|
||||
sh "make"
|
||||
end
|
||||
end
|
||||
CLEAN.include TEST_MEMORY_VIEW
|
||||
CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
|
||||
|
||||
task test: [LIB_FILE, TEST_MEMORY_VIEW, TEST_FIXTURE_AUDIO]
|
||||
task test: [LIB_FILE, TEST_MEMORY_VIEW]
|
||||
|
10
bindings/ruby/ext/.gitignore
vendored
10
bindings/ruby/ext/.gitignore
vendored
@ -2,8 +2,10 @@ Makefile
|
||||
whisper.so
|
||||
whisper.bundle
|
||||
whisper.dll
|
||||
scripts/get-flags.mk
|
||||
*.o
|
||||
*.a
|
||||
sources/*
|
||||
!sources/CMakeGraphVizOptions.cmake
|
||||
mkmf.log
|
||||
/*/**/*.c
|
||||
/*/**/*.cpp
|
||||
/*/**/*.h
|
||||
/*/**/*.m
|
||||
/*/**/*.metal
|
||||
|
9
bindings/ruby/ext/cpu.mk
Normal file
9
bindings/ruby/ext/cpu.mk
Normal file
@ -0,0 +1,9 @@
|
||||
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
|
||||
ggml/src/ggml-cpu/ggml-cpu.cpp \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-alloc.h \
|
||||
ggml/src/ggml-backend-impl.h \
|
||||
ggml/include/ggml-cpu.h \
|
||||
ggml/src/ggml-impl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@ -1,73 +0,0 @@
|
||||
require "tsort"
|
||||
|
||||
class Dependencies
|
||||
include TSort
|
||||
|
||||
def initialize(cmake, options)
|
||||
@cmake = cmake
|
||||
@options = options
|
||||
@static_lib_shape = nil
|
||||
@nodes = {}
|
||||
@graph = Hash.new {|h, k| h[k] = []}
|
||||
|
||||
generate_dot
|
||||
parse_dot
|
||||
end
|
||||
|
||||
def libs
|
||||
tsort.filter_map {|node|
|
||||
label, shape = @nodes[node]
|
||||
if shape == @static_lib_shape
|
||||
label.gsub(/\\n\([^)]+\)/, '')
|
||||
else
|
||||
nil
|
||||
end
|
||||
}.reverse.collect {|lib| "lib#{lib}.a"}
|
||||
end
|
||||
|
||||
def to_s
|
||||
libs.join(" ")
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def dot_path
|
||||
File.join(__dir__, "build", "whisper.cpp.dot")
|
||||
end
|
||||
|
||||
def generate_dot
|
||||
args = ["-S", "sources", "-B", "build", "--graphviz", dot_path, "-D", "BUILD_SHARED_LIBS=OFF"]
|
||||
args << @options.to_s unless @options.to_s.empty?
|
||||
system @cmake, *args, exception: true
|
||||
end
|
||||
|
||||
def parse_dot
|
||||
File.open(dot_path).each_line do |line|
|
||||
case line
|
||||
when /\[\s*label\s*=\s*"Static Library"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]/
|
||||
@static_lib_shape = $~[:shape]
|
||||
when /\A\s*"(?<node>\w+)"\s*\[\s*label\s*=\s*"(?<label>\S+)"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]\s*;\s*\z/
|
||||
node = $~[:node]
|
||||
label = $~[:label]
|
||||
shape = $~[:shape]
|
||||
@nodes[node] = [label, shape]
|
||||
when /\A\s*"(?<depender>\w+)"\s*->\s*"(?<dependee>\w+)"/
|
||||
depender = $~[:depender]
|
||||
dependee = $~[:dependee]
|
||||
@graph[depender] << dependee
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def tsort_each_node
|
||||
@nodes.each_key do |node|
|
||||
yield node
|
||||
end
|
||||
end
|
||||
|
||||
def tsort_each_child(node)
|
||||
@graph[node].each do |child|
|
||||
yield child
|
||||
end
|
||||
end
|
||||
end
|
@ -1,22 +1,206 @@
|
||||
require "mkmf"
|
||||
require_relative "options"
|
||||
require_relative "dependencies"
|
||||
require 'mkmf'
|
||||
|
||||
cmake = find_executable("cmake") || abort
|
||||
options = Options.new(cmake)
|
||||
have_library("gomp") rescue nil
|
||||
libs = Dependencies.new(cmake, options)
|
||||
# need to use c++ compiler flags
|
||||
$CXXFLAGS << ' -std=c++17'
|
||||
|
||||
$INCFLAGS << " -Isources/include -Isources/ggml/include -Isources/examples"
|
||||
$LOCAL_LIBS << " #{libs}"
|
||||
$cleanfiles << " build #{libs}"
|
||||
$LDFLAGS << ' -lstdc++'
|
||||
|
||||
create_makefile "whisper" do |conf|
|
||||
conf << <<~EOF
|
||||
$(TARGET_SO): #{libs}
|
||||
#{libs}: cmake-targets
|
||||
cmake-targets:
|
||||
#{"\t"}#{cmake} -S sources -B build -D BUILD_SHARED_LIBS=OFF -D CMAKE_ARCHIVE_OUTPUT_DIRECTORY=#{__dir__} -D CMAKE_POSITION_INDEPENDENT_CODE=ON #{options}
|
||||
#{"\t"}#{cmake} --build build --config Release --target common whisper
|
||||
EOF
|
||||
# Set to true when building binary gems
|
||||
if enable_config('static-stdlib', false)
|
||||
$LDFLAGS << ' -static-libgcc -static-libstdc++'
|
||||
end
|
||||
|
||||
if enable_config('march-tune-native', false)
|
||||
$CFLAGS << ' -march=native -mtune=native'
|
||||
$CXXFLAGS << ' -march=native -mtune=native'
|
||||
end
|
||||
|
||||
if ENV['WHISPER_METAL']
|
||||
$GGML_METAL ||= true
|
||||
$DEPRECATE_WARNING ||= true
|
||||
end
|
||||
|
||||
$UNAME_S = `uname -s`.chomp
|
||||
$UNAME_P = `uname -p`.chomp
|
||||
$UNAME_M = `uname -m`.chomp
|
||||
|
||||
if $UNAME_S == 'Darwin'
|
||||
unless ENV['GGML_NO_METAL']
|
||||
$GGML_METAL ||= true
|
||||
end
|
||||
$GGML_NO_OPENMP ||= true
|
||||
end
|
||||
|
||||
if $GGML_METAL
|
||||
$GGML_METAL_EMBED_LIBRARY = true
|
||||
end
|
||||
|
||||
$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples'
|
||||
$MK_CFLAGS = '-std=c11 -fPIC'
|
||||
$MK_CXXFLAGS = '-std=c++17 -fPIC'
|
||||
$MK_NVCCFLAGS = '-std=c++17'
|
||||
$MK_LDFLAGS = ''
|
||||
|
||||
$OBJ_GGML = []
|
||||
$OBJ_WHISPER = []
|
||||
$OBJ_COMMON = []
|
||||
$OBJ_SDL = []
|
||||
|
||||
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
|
||||
|
||||
if $UNAME_S == 'Linux'
|
||||
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
|
||||
end
|
||||
|
||||
if $UNAME_S == 'Darwin'
|
||||
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
|
||||
end
|
||||
|
||||
if ENV['WHISPER_DEBUG']
|
||||
$MK_CFLAGS << ' -O0 -g'
|
||||
$MK_CXXFLAGS << ' -O0 -g'
|
||||
$MK_LDFLAGS << ' -g'
|
||||
$MK_NVCCFLAGS << ' -O0 -g'
|
||||
else
|
||||
$MK_CPPFLAGS << ' -DNDEBUG'
|
||||
$MK_CFLAGS << ' -O3'
|
||||
$MK_CXXFLAGS << ' -O3'
|
||||
$MK_NVCCFLAGS << ' -O3'
|
||||
end
|
||||
|
||||
$WARN_FLAGS =
|
||||
' -Wall' <<
|
||||
' -Wextra' <<
|
||||
' -Wpedantic' <<
|
||||
' -Wcast-qual' <<
|
||||
' -Wno-unused-function'
|
||||
|
||||
$MK_CFLAGS <<
|
||||
$WARN_FLAGS <<
|
||||
' -Wshadow' <<
|
||||
' -Wstrict-prototypes' <<
|
||||
' -Wpointer-arith' <<
|
||||
' -Wmissing-prototypes' <<
|
||||
' -Werror=implicit-int' <<
|
||||
' -Werror=implicit-function-declaration'
|
||||
|
||||
$MK_CXXFLAGS <<
|
||||
$WARN_FLAGS <<
|
||||
' -Wmissing-declarations' <<
|
||||
' -Wmissing-noreturn'
|
||||
|
||||
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
|
||||
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
|
||||
end
|
||||
|
||||
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
|
||||
$MK_CFLAGS << ' -pthread'
|
||||
$MK_CXXFLAGS << ' -pthread'
|
||||
end
|
||||
|
||||
unless $_WIN32
|
||||
$DSO_EXT = '.so'
|
||||
else
|
||||
$DSO_EXT = '.dll'
|
||||
end
|
||||
|
||||
unless ENV['RISCV']
|
||||
if %w[x86_64 i686 amd64].include? $UNAME_M
|
||||
$HOST_CXXFLAGS ||= ''
|
||||
|
||||
$MK_CFLAGS << ' -march=native -mtune=native'
|
||||
$HOST_CXXFLAGS << ' -march=native -mtune=native'
|
||||
end
|
||||
else
|
||||
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
|
||||
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
|
||||
end
|
||||
|
||||
unless ENV['GGML_NO_ACCELERATE']
|
||||
if $UNAME_S == 'Darwin'
|
||||
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE'
|
||||
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
|
||||
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
|
||||
$MK_LDFLAGS << ' -framework Accelerate'
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
end
|
||||
|
||||
if ENV['GGML_OPENBLAS']
|
||||
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
|
||||
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
|
||||
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
|
||||
if ENV['GGML_OPENBLAS64']
|
||||
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
|
||||
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
|
||||
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
|
||||
if $GGML_METAL
|
||||
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
|
||||
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
|
||||
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal.o'
|
||||
|
||||
if ENV['GGML_METAL_NDEBUG']
|
||||
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
|
||||
end
|
||||
|
||||
if $GGML_METAL_EMBED_LIBRARY
|
||||
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
|
||||
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal-embed.o'
|
||||
end
|
||||
end
|
||||
|
||||
$OBJ_GGML <<
|
||||
'ggml/src/ggml.o' <<
|
||||
'ggml/src/ggml-alloc.o' <<
|
||||
'ggml/src/ggml-backend.o' <<
|
||||
'ggml/src/ggml-backend-reg.o' <<
|
||||
'ggml/src/ggml-opt.o' <<
|
||||
'ggml/src/ggml-quants.o' <<
|
||||
'ggml/src/ggml-threading.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-traits.o'
|
||||
|
||||
$OBJ_WHISPER <<
|
||||
'src/whisper.o'
|
||||
|
||||
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
|
||||
$objs <<
|
||||
"ruby_whisper.o" <<
|
||||
"ruby_whisper_context.o" <<
|
||||
"ruby_whisper_transcribe.o" <<
|
||||
"ruby_whisper_params.o" <<
|
||||
"ruby_whisper_error.o" <<
|
||||
"ruby_whisper_segment.o" <<
|
||||
"ruby_whisper_model.o"
|
||||
|
||||
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
|
||||
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
|
||||
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
|
||||
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
|
||||
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
|
||||
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
|
||||
|
||||
create_makefile('whisper')
|
||||
|
||||
File.open 'Makefile', 'a' do |file|
|
||||
file.puts 'include scripts/get-flags.mk'
|
||||
file.puts 'include cpu.mk'
|
||||
|
||||
if $GGML_METAL
|
||||
file.puts 'include metal.mk'
|
||||
|
||||
if $GGML_METAL_EMBED_LIBRARY
|
||||
file.puts 'include metal-embed.mk'
|
||||
end
|
||||
end
|
||||
end
|
||||
|
17
bindings/ruby/ext/metal-embed.mk
Normal file
17
bindings/ruby/ext/metal-embed.mk
Normal file
@ -0,0 +1,17 @@
|
||||
ggml/src/ggml-metal/ggml-metal-embed.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.metal \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/src/ggml-common.h
|
||||
@echo "Embedding Metal library"
|
||||
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
|
||||
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
||||
@rmdir ${TEMP_ASSEMBLY}
|
6
bindings/ruby/ext/metal.mk
Normal file
6
bindings/ruby/ext/metal.mk
Normal file
@ -0,0 +1,6 @@
|
||||
ggml/src/ggml-metal/ggml-metal.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.m \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/include/ggml-metal.h \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
@ -1,82 +0,0 @@
|
||||
class Options
|
||||
def initialize(cmake="cmake")
|
||||
@cmake = cmake
|
||||
@options = {}
|
||||
|
||||
configure
|
||||
end
|
||||
|
||||
def to_s
|
||||
@options
|
||||
.reject {|name, (type, value)| value.nil?}
|
||||
.collect {|name, (type, value)| "-D #{name}=#{value == true ? "ON" : value == false ? "OFF" : value.shellescape}"}
|
||||
.join(" ")
|
||||
end
|
||||
|
||||
def cmake_options
|
||||
return @cmake_options if @cmake_options
|
||||
|
||||
output = nil
|
||||
Dir.chdir __dir__ do
|
||||
output = `#{@cmake.shellescape} -S sources -B build -L`
|
||||
end
|
||||
@cmake_options = output.lines.drop_while {|line| line.chomp != "-- Cache values"}.drop(1)
|
||||
.filter_map {|line|
|
||||
option, value = line.chomp.split("=", 2)
|
||||
name, type = option.split(":", 2)
|
||||
[
|
||||
name,
|
||||
[
|
||||
type,
|
||||
type == "BOOL" ? value == "ON" : value
|
||||
]
|
||||
]
|
||||
}.to_h
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def configure
|
||||
cmake_options.each_pair do |name, (type, default_value)|
|
||||
option = option_name(name)
|
||||
value = type == "BOOL" ? enable_config(option) : arg_config("--#{option}")
|
||||
@options[name] = [type, value]
|
||||
end
|
||||
|
||||
configure_accelerate
|
||||
configure_metal
|
||||
configure_coreml
|
||||
end
|
||||
|
||||
# See ggml/src/ggml-cpu/CMakeLists.txt
|
||||
def configure_accelerate
|
||||
if RUBY_PLATFORM.match?(/darwin/) && enabled?("GGML_ACCELERATE")
|
||||
$LDFLAGS << " -framework Accelerate"
|
||||
end
|
||||
end
|
||||
|
||||
# See ggml/src/ggml-metal/CMakeLists.txt
|
||||
def configure_metal
|
||||
$LDFLAGS << " -framework Foundation -framework Metal -framework MetalKit" if enabled?("GGML_METAL")
|
||||
end
|
||||
|
||||
# See src/CmakeLists.txt
|
||||
def configure_coreml
|
||||
if enabled?("WHISPER_COREML")
|
||||
$LDFLAGS << " -framework Foundation -framework CoreML"
|
||||
$CPPFLAGS << " -DRUBY_WHISPER_USE_COREML"
|
||||
end
|
||||
end
|
||||
|
||||
def option_name(name)
|
||||
name.downcase.gsub("_", "-")
|
||||
end
|
||||
|
||||
def enabled?(option)
|
||||
if @options[option][1].nil?
|
||||
cmake_options[option][1]
|
||||
else
|
||||
@options[option][1]
|
||||
end
|
||||
end
|
||||
end
|
@ -3,10 +3,8 @@
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
VALUE mWhisper;
|
||||
VALUE mVAD;
|
||||
VALUE cContext;
|
||||
VALUE cParams;
|
||||
VALUE cVADParams;
|
||||
VALUE eError;
|
||||
|
||||
VALUE cSegment;
|
||||
@ -22,9 +20,6 @@ ID id_new;
|
||||
ID id_to_path;
|
||||
ID id_URI;
|
||||
ID id_pre_converted_models;
|
||||
ID id_coreml_compiled_models;
|
||||
ID id_cache;
|
||||
ID id_n_processors;
|
||||
|
||||
static bool is_log_callback_finalized = false;
|
||||
|
||||
@ -36,7 +31,6 @@ extern void init_ruby_whisper_params(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_error(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
|
||||
extern void init_ruby_whisper_model(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
|
||||
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
|
||||
|
||||
/*
|
||||
@ -86,14 +80,6 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
|
||||
return rb_str_new2(str_full);
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* system_info_str -> String
|
||||
*/
|
||||
static VALUE ruby_whisper_s_system_info_str(VALUE self) {
|
||||
return rb_str_new2(whisper_print_system_info());
|
||||
}
|
||||
|
||||
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
|
||||
is_log_callback_finalized = true;
|
||||
return Qnil;
|
||||
@ -130,6 +116,16 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
|
||||
rb_gc_mark(rwm->context);
|
||||
}
|
||||
|
||||
static VALUE ruby_whisper_model_allocate(VALUE klass) {
|
||||
ruby_whisper_model *rwm;
|
||||
rwm = ALLOC(ruby_whisper_model);
|
||||
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
|
||||
}
|
||||
|
||||
void Init_whisper() {
|
||||
id_to_s = rb_intern("to_s");
|
||||
id_call = rb_intern("call");
|
||||
@ -141,12 +137,8 @@ void Init_whisper() {
|
||||
id_to_path = rb_intern("to_path");
|
||||
id_URI = rb_intern("URI");
|
||||
id_pre_converted_models = rb_intern("pre_converted_models");
|
||||
id_coreml_compiled_models = rb_intern("coreml_compiled_models");
|
||||
id_cache = rb_intern("cache");
|
||||
id_n_processors = rb_intern("n_processors");
|
||||
|
||||
mWhisper = rb_define_module("Whisper");
|
||||
mVAD = rb_define_module_under(mWhisper, "VAD");
|
||||
|
||||
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
|
||||
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
|
||||
@ -159,7 +151,6 @@ void Init_whisper() {
|
||||
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
|
||||
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
|
||||
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
|
||||
rb_define_singleton_method(mWhisper, "system_info_str", ruby_whisper_s_system_info_str, 0);
|
||||
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
|
||||
rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
|
||||
|
||||
@ -168,9 +159,6 @@ void Init_whisper() {
|
||||
init_ruby_whisper_error(&mWhisper);
|
||||
init_ruby_whisper_segment(&mWhisper, &cContext);
|
||||
init_ruby_whisper_model(&mWhisper);
|
||||
init_ruby_whisper_vad_params(&mVAD);
|
||||
|
||||
rb_require("whisper/context");
|
||||
rb_require("whisper/segment");
|
||||
rb_require("whisper/model/uri");
|
||||
}
|
||||
|
@ -19,15 +19,9 @@ typedef struct {
|
||||
bool diarize;
|
||||
ruby_whisper_callback_container *new_segment_callback_container;
|
||||
ruby_whisper_callback_container *progress_callback_container;
|
||||
ruby_whisper_callback_container *encoder_begin_callback_container;
|
||||
ruby_whisper_callback_container *abort_callback_container;
|
||||
VALUE vad_params;
|
||||
} ruby_whisper_params;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_vad_params params;
|
||||
} ruby_whisper_vad_params;
|
||||
|
||||
typedef struct {
|
||||
VALUE context;
|
||||
int index;
|
||||
|
@ -11,21 +11,15 @@ extern ID id_new;
|
||||
extern ID id_to_path;
|
||||
extern ID id_URI;
|
||||
extern ID id_pre_converted_models;
|
||||
extern ID id_coreml_compiled_models;
|
||||
extern ID id_cache;
|
||||
extern ID id_n_processors;
|
||||
|
||||
extern VALUE cContext;
|
||||
extern VALUE eError;
|
||||
extern VALUE cModel;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_params_type;
|
||||
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
|
||||
extern VALUE rb_whisper_model_s_new(VALUE context);
|
||||
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
||||
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
|
||||
|
||||
ID transcribe_option_names[1];
|
||||
extern VALUE rb_whisper_model_initialize(VALUE context);
|
||||
extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
|
||||
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
|
||||
|
||||
static void
|
||||
ruby_whisper_free(ruby_whisper *rw)
|
||||
@ -43,74 +37,19 @@ rb_whisper_mark(ruby_whisper *rw)
|
||||
}
|
||||
|
||||
void
|
||||
rb_whisper_free(void *p)
|
||||
rb_whisper_free(ruby_whisper *rw)
|
||||
{
|
||||
ruby_whisper *rw = (ruby_whisper *)p;
|
||||
ruby_whisper_free(rw);
|
||||
free(rw);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper *rw = (const ruby_whisper *)p;
|
||||
size_t size = sizeof(rw);
|
||||
if (!rw) {
|
||||
return 0;
|
||||
}
|
||||
if (rw->context) {
|
||||
size += sizeof(rw->context);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
const rb_data_type_t ruby_whisper_type = {
|
||||
"ruby_whisper",
|
||||
{0, rb_whisper_free, ruby_whisper_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper, &ruby_whisper_type, rw);
|
||||
rw = ALLOC(ruby_whisper);
|
||||
rw->context = NULL;
|
||||
return obj;
|
||||
}
|
||||
|
||||
VALUE
|
||||
ruby_whisper_normalize_model_path(VALUE model_path)
|
||||
{
|
||||
VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
|
||||
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
|
||||
if (!NIL_P(pre_converted_model)) {
|
||||
model_path = pre_converted_model;
|
||||
#ifdef RUBY_WHISPER_USE_COREML
|
||||
VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
|
||||
VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
|
||||
if (!NIL_P(coreml_converted_model)) {
|
||||
rb_funcall(coreml_converted_model, id_cache, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (TYPE(model_path) == T_STRING) {
|
||||
const char * model_path_str = StringValueCStr(model_path);
|
||||
if (strncmp("http://", model_path_str, 7) == 0 || strncmp("https://", model_path_str, 8) == 0) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
model_path = rb_class_new_instance(1, &model_path, uri_class);
|
||||
}
|
||||
}
|
||||
else if (rb_obj_is_kind_of(model_path, rb_path2class("URI::HTTP"))) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
model_path = rb_class_new_instance(1, &model_path, uri_class);
|
||||
}
|
||||
if (rb_respond_to(model_path, id_to_path)) {
|
||||
model_path = rb_funcall(model_path, id_to_path, 0);
|
||||
}
|
||||
|
||||
return model_path;
|
||||
return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -127,9 +66,27 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
|
||||
|
||||
// TODO: we can support init from buffer here too maybe another ruby object to expose
|
||||
rb_scan_args(argc, argv, "01", &whisper_model_file_path);
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
|
||||
whisper_model_file_path = ruby_whisper_normalize_model_path(whisper_model_file_path);
|
||||
VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
|
||||
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, whisper_model_file_path);
|
||||
if (!NIL_P(pre_converted_model)) {
|
||||
whisper_model_file_path = pre_converted_model;
|
||||
}
|
||||
if (TYPE(whisper_model_file_path) == T_STRING) {
|
||||
const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
|
||||
if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
|
||||
}
|
||||
}
|
||||
if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
|
||||
}
|
||||
if (rb_respond_to(whisper_model_file_path, id_to_path)) {
|
||||
whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
|
||||
}
|
||||
if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
|
||||
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
|
||||
}
|
||||
@ -147,7 +104,7 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
|
||||
VALUE ruby_whisper_model_n_vocab(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_vocab(rw->context));
|
||||
}
|
||||
|
||||
@ -158,7 +115,7 @@ VALUE ruby_whisper_model_n_vocab(VALUE self)
|
||||
VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
|
||||
}
|
||||
|
||||
@ -169,7 +126,7 @@ VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
|
||||
VALUE ruby_whisper_model_n_audio_state(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_state(rw->context));
|
||||
}
|
||||
|
||||
@ -180,7 +137,7 @@ VALUE ruby_whisper_model_n_audio_state(VALUE self)
|
||||
VALUE ruby_whisper_model_n_audio_head(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_head(rw->context));
|
||||
}
|
||||
|
||||
@ -191,7 +148,7 @@ VALUE ruby_whisper_model_n_audio_head(VALUE self)
|
||||
VALUE ruby_whisper_model_n_audio_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_layer(rw->context));
|
||||
}
|
||||
|
||||
@ -202,7 +159,7 @@ VALUE ruby_whisper_model_n_audio_layer(VALUE self)
|
||||
VALUE ruby_whisper_model_n_text_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_ctx(rw->context));
|
||||
}
|
||||
|
||||
@ -213,7 +170,7 @@ VALUE ruby_whisper_model_n_text_ctx(VALUE self)
|
||||
VALUE ruby_whisper_model_n_text_state(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_state(rw->context));
|
||||
}
|
||||
|
||||
@ -224,7 +181,7 @@ VALUE ruby_whisper_model_n_text_state(VALUE self)
|
||||
VALUE ruby_whisper_model_n_text_head(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_head(rw->context));
|
||||
}
|
||||
|
||||
@ -235,7 +192,7 @@ VALUE ruby_whisper_model_n_text_head(VALUE self)
|
||||
VALUE ruby_whisper_model_n_text_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_layer(rw->context));
|
||||
}
|
||||
|
||||
@ -246,7 +203,7 @@ VALUE ruby_whisper_model_n_text_layer(VALUE self)
|
||||
VALUE ruby_whisper_model_n_mels(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_mels(rw->context));
|
||||
}
|
||||
|
||||
@ -257,7 +214,7 @@ VALUE ruby_whisper_model_n_mels(VALUE self)
|
||||
VALUE ruby_whisper_model_ftype(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_ftype(rw->context));
|
||||
}
|
||||
|
||||
@ -268,7 +225,7 @@ VALUE ruby_whisper_model_ftype(VALUE self)
|
||||
VALUE ruby_whisper_model_type(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return rb_str_new2(whisper_model_type_readable(rw->context));
|
||||
}
|
||||
|
||||
@ -291,9 +248,9 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
|
||||
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
VALUE params = argv[0];
|
||||
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
||||
VALUE samples = argv[1];
|
||||
int n_samples;
|
||||
rb_memory_view_t view;
|
||||
@ -308,20 +265,13 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
|
||||
// Should check when samples.respond_to?(:length)?
|
||||
} else {
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
if (RARRAY_LEN(samples) > INT_MAX) {
|
||||
rb_raise(rb_eArgError, "samples are too long");
|
||||
}
|
||||
n_samples = (int)RARRAY_LEN(samples);
|
||||
n_samples = RARRAY_LEN(samples);
|
||||
} else if (memory_view_available_p) {
|
||||
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
|
||||
view.obj = Qnil;
|
||||
rb_raise(rb_eArgError, "unable to get a memory view");
|
||||
}
|
||||
ssize_t n_samples_size = view.byte_size / view.item_size;
|
||||
if (n_samples_size > INT_MAX) {
|
||||
rb_raise(rb_eArgError, "samples are too long");
|
||||
}
|
||||
n_samples = (int)n_samples_size;
|
||||
n_samples = view.byte_size / view.item_size;
|
||||
} else if (rb_respond_to(samples, id_length)) {
|
||||
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
||||
} else {
|
||||
@ -346,7 +296,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
|
||||
}
|
||||
}
|
||||
}
|
||||
prepare_transcription(rwp, &self);
|
||||
register_callbacks(rwp, &self);
|
||||
const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
|
||||
if (0 == result) {
|
||||
return self;
|
||||
@ -377,9 +327,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
|
||||
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
VALUE params = argv[0];
|
||||
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
||||
VALUE samples = argv[1];
|
||||
int n_samples;
|
||||
int n_processors;
|
||||
@ -409,17 +359,10 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
|
||||
view.obj = Qnil;
|
||||
rb_raise(rb_eArgError, "unable to get a memory view");
|
||||
}
|
||||
ssize_t n_samples_size = view.byte_size / view.item_size;
|
||||
if (n_samples_size > INT_MAX) {
|
||||
rb_raise(rb_eArgError, "samples are too long");
|
||||
}
|
||||
n_samples = (int)n_samples_size;
|
||||
n_samples = view.byte_size / view.item_size;
|
||||
} else {
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
if (RARRAY_LEN(samples) > INT_MAX) {
|
||||
rb_raise(rb_eArgError, "samples are too long");
|
||||
}
|
||||
n_samples = (int)RARRAY_LEN(samples);
|
||||
n_samples = RARRAY_LEN(samples);
|
||||
} else if (rb_respond_to(samples, id_length)) {
|
||||
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
||||
} else {
|
||||
@ -444,7 +387,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
|
||||
}
|
||||
}
|
||||
}
|
||||
prepare_transcription(rwp, &self);
|
||||
register_callbacks(rwp, &self);
|
||||
const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
|
||||
if (0 == result) {
|
||||
return self;
|
||||
@ -463,7 +406,7 @@ static VALUE
|
||||
ruby_whisper_full_n_segments(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_full_n_segments(rw->context));
|
||||
}
|
||||
|
||||
@ -477,7 +420,7 @@ static VALUE
|
||||
ruby_whisper_full_lang_id(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_full_lang_id(rw->context));
|
||||
}
|
||||
|
||||
@ -502,10 +445,10 @@ static VALUE
|
||||
ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
|
||||
return LONG2NUM(t0);
|
||||
return INT2NUM(t0);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -520,10 +463,10 @@ static VALUE
|
||||
ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
|
||||
return LONG2NUM(t1);
|
||||
return INT2NUM(t1);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -538,7 +481,7 @@ static VALUE
|
||||
ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
|
||||
return speaker_turn_next ? Qtrue : Qfalse;
|
||||
@ -556,7 +499,7 @@ static VALUE
|
||||
ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
|
||||
return rb_str_new2(text);
|
||||
@ -570,7 +513,7 @@ static VALUE
|
||||
ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
|
||||
return DBL2NUM(no_speech_prob);
|
||||
@ -581,7 +524,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
|
||||
static VALUE
|
||||
ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
|
||||
{
|
||||
return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
|
||||
return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -611,11 +554,11 @@ ruby_whisper_each_segment(VALUE self)
|
||||
}
|
||||
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(rw->context);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
rb_yield(rb_whisper_segment_s_new(self, i));
|
||||
rb_yield(rb_whisper_segment_initialize(self, i));
|
||||
}
|
||||
|
||||
return self;
|
||||
@ -628,7 +571,7 @@ ruby_whisper_each_segment(VALUE self)
|
||||
static VALUE
|
||||
ruby_whisper_get_model(VALUE self)
|
||||
{
|
||||
return rb_whisper_model_s_new(self);
|
||||
return rb_whisper_model_initialize(self);
|
||||
}
|
||||
|
||||
void
|
||||
@ -636,8 +579,6 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
||||
{
|
||||
cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
|
||||
|
||||
transcribe_option_names[0] = id_n_processors;
|
||||
|
||||
rb_define_alloc_func(cContext, ruby_whisper_allocate);
|
||||
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
|
||||
|
||||
@ -664,7 +605,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
||||
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
||||
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
||||
|
||||
// High level
|
||||
// High leve
|
||||
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
|
||||
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
||||
|
||||
|
@ -1,44 +1,22 @@
|
||||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
|
||||
extern VALUE cModel;
|
||||
|
||||
static void rb_whisper_model_mark(void *p) {
|
||||
ruby_whisper_model *rwm = (ruby_whisper_model *)p;
|
||||
if (rwm->context) {
|
||||
rb_gc_mark(rwm->context);
|
||||
}
|
||||
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
|
||||
rb_gc_mark(rwm->context);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_model_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_model *rwm = (const ruby_whisper_model *)p;
|
||||
size_t size = sizeof(rwm);
|
||||
if (!rwm) {
|
||||
return 0;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static const rb_data_type_t rb_whisper_model_type = {
|
||||
"ruby_whisper_model",
|
||||
{rb_whisper_model_mark, RUBY_DEFAULT_FREE, ruby_whisper_model_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE ruby_whisper_model_allocate(VALUE klass) {
|
||||
ruby_whisper_model *rwm;
|
||||
return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
rwm = ALLOC(ruby_whisper_model);
|
||||
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
|
||||
}
|
||||
|
||||
VALUE rb_whisper_model_s_new(VALUE context) {
|
||||
VALUE rb_whisper_model_initialize(VALUE context) {
|
||||
ruby_whisper_model *rwm;
|
||||
const VALUE model = ruby_whisper_model_allocate(cModel);
|
||||
TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(model, ruby_whisper_model, rwm);
|
||||
rwm->context = context;
|
||||
return model;
|
||||
};
|
||||
@ -51,9 +29,9 @@ static VALUE
|
||||
ruby_whisper_model_n_vocab(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_vocab(rw->context));
|
||||
}
|
||||
|
||||
@ -65,9 +43,9 @@ static VALUE
|
||||
ruby_whisper_model_n_audio_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
|
||||
}
|
||||
|
||||
@ -79,9 +57,9 @@ static VALUE
|
||||
ruby_whisper_model_n_audio_state(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_state(rw->context));
|
||||
}
|
||||
|
||||
@ -93,9 +71,9 @@ static VALUE
|
||||
ruby_whisper_model_n_audio_head(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_head(rw->context));
|
||||
}
|
||||
|
||||
@ -107,9 +85,9 @@ static VALUE
|
||||
ruby_whisper_model_n_audio_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_layer(rw->context));
|
||||
}
|
||||
|
||||
@ -121,9 +99,9 @@ static VALUE
|
||||
ruby_whisper_model_n_text_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_ctx(rw->context));
|
||||
}
|
||||
|
||||
@ -135,9 +113,9 @@ static VALUE
|
||||
ruby_whisper_model_n_text_state(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_state(rw->context));
|
||||
}
|
||||
|
||||
@ -149,9 +127,9 @@ static VALUE
|
||||
ruby_whisper_model_n_text_head(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_head(rw->context));
|
||||
}
|
||||
|
||||
@ -163,9 +141,9 @@ static VALUE
|
||||
ruby_whisper_model_n_text_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_layer(rw->context));
|
||||
}
|
||||
|
||||
@ -177,9 +155,9 @@ static VALUE
|
||||
ruby_whisper_model_n_mels(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_mels(rw->context));
|
||||
}
|
||||
|
||||
@ -191,9 +169,9 @@ static VALUE
|
||||
ruby_whisper_model_ftype(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_ftype(rw->context));
|
||||
}
|
||||
|
||||
@ -205,9 +183,9 @@ static VALUE
|
||||
ruby_whisper_model_type(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return rb_str_new2(whisper_model_type_readable(rw->context));
|
||||
}
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
#define BOOL_PARAMS_SETTER(self, prop, value) \
|
||||
ruby_whisper_params *rwp; \
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp); \
|
||||
if (value == Qfalse || value == Qnil) { \
|
||||
rwp->params.prop = false; \
|
||||
} else { \
|
||||
@ -13,7 +13,7 @@
|
||||
|
||||
#define BOOL_PARAMS_GETTER(self, prop) \
|
||||
ruby_whisper_params *rwp; \
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp); \
|
||||
if (rwp->params.prop) { \
|
||||
return Qtrue; \
|
||||
} else { \
|
||||
@ -26,16 +26,13 @@
|
||||
rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \
|
||||
rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1);
|
||||
|
||||
#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 35
|
||||
#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 30
|
||||
|
||||
extern VALUE cParams;
|
||||
extern VALUE cVADParams;
|
||||
|
||||
extern ID id_call;
|
||||
|
||||
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
|
||||
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
||||
extern const rb_data_type_t ruby_whisper_vad_params_type;
|
||||
extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
|
||||
|
||||
static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
|
||||
static ID id_language;
|
||||
@ -66,19 +63,12 @@ static ID id_new_segment_callback;
|
||||
static ID id_new_segment_callback_user_data;
|
||||
static ID id_progress_callback;
|
||||
static ID id_progress_callback_user_data;
|
||||
static ID id_encoder_begin_callback;
|
||||
static ID id_encoder_begin_callback_user_data;
|
||||
static ID id_abort_callback;
|
||||
static ID id_abort_callback_user_data;
|
||||
static ID id_vad;
|
||||
static ID id_vad_model_path;
|
||||
static ID id_vad_params;
|
||||
|
||||
static void
|
||||
rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
|
||||
{
|
||||
if (rwc == NULL) return;
|
||||
|
||||
rb_gc_mark(rwc->user_data);
|
||||
rb_gc_mark(rwc->callback);
|
||||
rb_gc_mark(rwc->callbacks);
|
||||
@ -110,7 +100,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta
|
||||
const int n_segments = whisper_full_n_segments_from_state(state);
|
||||
for (int i = n_new; i > 0; i--) {
|
||||
int i_segment = n_segments - i;
|
||||
VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment);
|
||||
VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment);
|
||||
for (int j = 0; j < callbacks_len; j++) {
|
||||
VALUE cb = rb_ary_entry(container->callbacks, j);
|
||||
rb_funcall(cb, id_call, 1, segment);
|
||||
@ -136,33 +126,6 @@ static void progress_callback(struct whisper_context *ctx, struct whisper_state
|
||||
}
|
||||
}
|
||||
|
||||
static bool encoder_begin_callback(struct whisper_context *ctx, struct whisper_state *state, void *user_data) {
|
||||
const ruby_whisper_callback_container *container = (ruby_whisper_callback_container *)user_data;
|
||||
bool is_aborted = false;
|
||||
VALUE result;
|
||||
|
||||
// Currently, doesn't support state because
|
||||
// those require to resolve GC-related problems.
|
||||
if (!NIL_P(container->callback)) {
|
||||
result = rb_funcall(container->callback, id_call, 3, *container->context, Qnil, container->user_data);
|
||||
if (result == Qfalse) {
|
||||
is_aborted = true;
|
||||
}
|
||||
}
|
||||
const long callbacks_len = RARRAY_LEN(container->callbacks);
|
||||
if (0 == callbacks_len) {
|
||||
return !is_aborted;
|
||||
}
|
||||
for (int j = 0; j < callbacks_len; j++) {
|
||||
VALUE cb = rb_ary_entry(container->callbacks, j);
|
||||
result = rb_funcall(cb, id_call, 0);
|
||||
if (result == Qfalse) {
|
||||
is_aborted = true;
|
||||
}
|
||||
}
|
||||
return !is_aborted;
|
||||
}
|
||||
|
||||
static bool abort_callback(void * user_data) {
|
||||
const ruby_whisper_callback_container *container = (ruby_whisper_callback_container *)user_data;
|
||||
if (!NIL_P(container->callback)) {
|
||||
@ -185,7 +148,7 @@ static bool abort_callback(void * user_data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
||||
void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
||||
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
||||
rwp->new_segment_callback_container->context = context;
|
||||
rwp->params.new_segment_callback = new_segment_callback;
|
||||
@ -198,12 +161,6 @@ static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
||||
rwp->params.progress_callback_user_data = rwp->progress_callback_container;
|
||||
}
|
||||
|
||||
if (!NIL_P(rwp->encoder_begin_callback_container->callback) || 0 != RARRAY_LEN(rwp->encoder_begin_callback_container->callbacks)) {
|
||||
rwp->encoder_begin_callback_container->context = context;
|
||||
rwp->params.encoder_begin_callback = encoder_begin_callback;
|
||||
rwp->params.encoder_begin_callback_user_data = rwp->encoder_begin_callback_container;
|
||||
}
|
||||
|
||||
if (!NIL_P(rwp->abort_callback_container->callback) || 0 != RARRAY_LEN(rwp->abort_callback_container->callbacks)) {
|
||||
rwp->abort_callback_container->context = context;
|
||||
rwp->params.abort_callback = abort_callback;
|
||||
@ -211,29 +168,12 @@ static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
|
||||
}
|
||||
}
|
||||
|
||||
static void set_vad_params(ruby_whisper_params *rwp)
|
||||
{
|
||||
ruby_whisper_vad_params * rwvp;
|
||||
TypedData_Get_Struct(rwp->vad_params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwp->params.vad_params = rwvp->params;
|
||||
}
|
||||
|
||||
void
|
||||
prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
|
||||
rb_whisper_params_mark(ruby_whisper_params *rwp)
|
||||
{
|
||||
register_callbacks(rwp, context);
|
||||
set_vad_params(rwp);
|
||||
}
|
||||
|
||||
void
|
||||
rb_whisper_params_mark(void *p)
|
||||
{
|
||||
ruby_whisper_params *rwp = (ruby_whisper_params *)p;
|
||||
rb_whisper_callbcack_container_mark(rwp->new_segment_callback_container);
|
||||
rb_whisper_callbcack_container_mark(rwp->progress_callback_container);
|
||||
rb_whisper_callbcack_container_mark(rwp->encoder_begin_callback_container);
|
||||
rb_whisper_callbcack_container_mark(rwp->abort_callback_container);
|
||||
rb_gc_mark(rwp->vad_params);
|
||||
}
|
||||
|
||||
void
|
||||
@ -242,46 +182,24 @@ ruby_whisper_params_free(ruby_whisper_params *rwp)
|
||||
}
|
||||
|
||||
void
|
||||
rb_whisper_params_free(void *p)
|
||||
rb_whisper_params_free(ruby_whisper_params *rwp)
|
||||
{
|
||||
ruby_whisper_params *rwp = (ruby_whisper_params *)p;
|
||||
// How to free user_data and callback only when not referred to by others?
|
||||
ruby_whisper_params_free(rwp);
|
||||
free(rwp);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_params_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_params *rwp = (const ruby_whisper_params *)p;
|
||||
|
||||
return sizeof(ruby_whisper_params) + sizeof(rwp->params) + sizeof(rwp->vad_params);
|
||||
}
|
||||
|
||||
const rb_data_type_t ruby_whisper_params_type = {
|
||||
"ruby_whisper_params",
|
||||
{
|
||||
rb_whisper_params_mark,
|
||||
rb_whisper_params_free,
|
||||
ruby_whisper_params_memsize,
|
||||
},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rwp = ALLOC(ruby_whisper_params);
|
||||
rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
rwp->diarize = false;
|
||||
rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
|
||||
rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
|
||||
rwp->progress_callback_container = rb_whisper_callback_container_allocate();
|
||||
rwp->encoder_begin_callback_container = rb_whisper_callback_container_allocate();
|
||||
rwp->abort_callback_container = rb_whisper_callback_container_allocate();
|
||||
return obj;
|
||||
return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -294,7 +212,7 @@ static VALUE
|
||||
ruby_whisper_params_set_language(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
if (value == Qfalse || value == Qnil) {
|
||||
rwp->params.language = "auto";
|
||||
} else {
|
||||
@ -310,7 +228,7 @@ static VALUE
|
||||
ruby_whisper_params_get_language(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
if (rwp->params.language) {
|
||||
return rb_str_new2(rwp->params.language);
|
||||
} else {
|
||||
@ -547,7 +465,7 @@ static VALUE
|
||||
ruby_whisper_params_get_initial_prompt(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->params.initial_prompt == NULL ? Qnil : rb_str_new2(rwp->params.initial_prompt);
|
||||
}
|
||||
/*
|
||||
@ -558,7 +476,7 @@ static VALUE
|
||||
ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.initial_prompt = StringValueCStr(value);
|
||||
return value;
|
||||
}
|
||||
@ -572,7 +490,7 @@ static VALUE
|
||||
ruby_whisper_params_get_diarize(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
if (rwp->diarize) {
|
||||
return Qtrue;
|
||||
} else {
|
||||
@ -587,7 +505,7 @@ static VALUE
|
||||
ruby_whisper_params_set_diarize(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
if (value == Qfalse || value == Qnil) {
|
||||
rwp->diarize = false;
|
||||
} else {
|
||||
@ -606,7 +524,7 @@ static VALUE
|
||||
ruby_whisper_params_get_offset(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return INT2NUM(rwp->params.offset_ms);
|
||||
}
|
||||
/*
|
||||
@ -617,7 +535,7 @@ static VALUE
|
||||
ruby_whisper_params_set_offset(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.offset_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
@ -631,7 +549,7 @@ static VALUE
|
||||
ruby_whisper_params_get_duration(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return INT2NUM(rwp->params.duration_ms);
|
||||
}
|
||||
/*
|
||||
@ -642,7 +560,7 @@ static VALUE
|
||||
ruby_whisper_params_set_duration(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.duration_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
@ -657,7 +575,7 @@ static VALUE
|
||||
ruby_whisper_params_get_max_text_tokens(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return INT2NUM(rwp->params.n_max_text_ctx);
|
||||
}
|
||||
/*
|
||||
@ -668,7 +586,7 @@ static VALUE
|
||||
ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.n_max_text_ctx = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
@ -680,7 +598,7 @@ static VALUE
|
||||
ruby_whisper_params_get_temperature(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.temperature);
|
||||
}
|
||||
/*
|
||||
@ -691,7 +609,7 @@ static VALUE
|
||||
ruby_whisper_params_set_temperature(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.temperature = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -705,7 +623,7 @@ static VALUE
|
||||
ruby_whisper_params_get_max_initial_ts(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.max_initial_ts);
|
||||
}
|
||||
/*
|
||||
@ -716,7 +634,7 @@ static VALUE
|
||||
ruby_whisper_params_set_max_initial_ts(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.max_initial_ts = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -728,7 +646,7 @@ static VALUE
|
||||
ruby_whisper_params_get_length_penalty(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.length_penalty);
|
||||
}
|
||||
/*
|
||||
@ -739,7 +657,7 @@ static VALUE
|
||||
ruby_whisper_params_set_length_penalty(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.length_penalty = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -751,7 +669,7 @@ static VALUE
|
||||
ruby_whisper_params_get_temperature_inc(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.temperature_inc);
|
||||
}
|
||||
/*
|
||||
@ -762,7 +680,7 @@ static VALUE
|
||||
ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.temperature_inc = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -776,7 +694,7 @@ static VALUE
|
||||
ruby_whisper_params_get_entropy_thold(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.entropy_thold);
|
||||
}
|
||||
/*
|
||||
@ -787,7 +705,7 @@ static VALUE
|
||||
ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.entropy_thold = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -799,7 +717,7 @@ static VALUE
|
||||
ruby_whisper_params_get_logprob_thold(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.logprob_thold);
|
||||
}
|
||||
/*
|
||||
@ -810,7 +728,7 @@ static VALUE
|
||||
ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.logprob_thold = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -822,7 +740,7 @@ static VALUE
|
||||
ruby_whisper_params_get_no_speech_thold(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return DBL2NUM(rwp->params.no_speech_thold);
|
||||
}
|
||||
/*
|
||||
@ -833,7 +751,7 @@ static VALUE
|
||||
ruby_whisper_params_set_no_speech_thold(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->params.no_speech_thold = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
@ -841,7 +759,7 @@ static VALUE
|
||||
ruby_whisper_params_get_new_segment_callback(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->new_segment_callback_container->callback;
|
||||
}
|
||||
/*
|
||||
@ -858,7 +776,7 @@ static VALUE
|
||||
ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->new_segment_callback_container->callback = value;
|
||||
return value;
|
||||
}
|
||||
@ -866,7 +784,7 @@ static VALUE
|
||||
ruby_whisper_params_get_new_segment_callback_user_data(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->new_segment_callback_container->user_data;
|
||||
}
|
||||
/*
|
||||
@ -879,7 +797,7 @@ static VALUE
|
||||
ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->new_segment_callback_container->user_data = value;
|
||||
return value;
|
||||
}
|
||||
@ -887,7 +805,7 @@ static VALUE
|
||||
ruby_whisper_params_get_progress_callback(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->progress_callback_container->callback;
|
||||
}
|
||||
/*
|
||||
@ -906,7 +824,7 @@ static VALUE
|
||||
ruby_whisper_params_set_progress_callback(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->progress_callback_container->callback = value;
|
||||
return value;
|
||||
}
|
||||
@ -914,7 +832,7 @@ static VALUE
|
||||
ruby_whisper_params_get_progress_callback_user_data(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->progress_callback_container->user_data;
|
||||
}
|
||||
/*
|
||||
@ -927,66 +845,15 @@ static VALUE
|
||||
ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->progress_callback_container->user_data = value;
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_get_encoder_begin_callback(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
return rwp->encoder_begin_callback_container->callback;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sets encoder begin callback, called when the encoder starts.
|
||||
*
|
||||
* params.encoder_begin_callback = ->(context, _, user_data) {
|
||||
* # ...
|
||||
* }
|
||||
*
|
||||
* call-seq:
|
||||
* encoder_begin_callback = callback -> callback
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_params_set_encoder_begin_callback(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rwp->encoder_begin_callback_container->callback = value;
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_get_encoder_begin_callback_user_data(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
return rwp->encoder_begin_callback_container->user_data;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sets user data passed to the last argument of encoder begin callback.
|
||||
*
|
||||
* call-seq:
|
||||
* encoder_begin_callback_user_data = user_data -> use_data
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_params_set_encoder_begin_callback_user_data(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rwp->encoder_begin_callback_container->user_data = value;
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_get_abort_callback(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->abort_callback_container->callback;
|
||||
}
|
||||
/*
|
||||
@ -1003,7 +870,7 @@ static VALUE
|
||||
ruby_whisper_params_set_abort_callback(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->abort_callback_container->callback = value;
|
||||
return value;
|
||||
}
|
||||
@ -1011,7 +878,7 @@ static VALUE
|
||||
ruby_whisper_params_get_abort_callback_user_data(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
return rwp->abort_callback_container->user_data;
|
||||
}
|
||||
/*
|
||||
@ -1024,74 +891,11 @@ static VALUE
|
||||
ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
rwp->abort_callback_container->user_data = value;
|
||||
return value;
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* vad = use_vad -> use_vad
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_params_get_vad(VALUE self)
|
||||
{
|
||||
BOOL_PARAMS_GETTER(self, vad)
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_set_vad(VALUE self, VALUE value)
|
||||
{
|
||||
BOOL_PARAMS_SETTER(self, vad, value)
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* vad_model_path = model_path -> model_path
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
if (NIL_P(value)) {
|
||||
rwp->params.vad_model_path = NULL;
|
||||
return value;
|
||||
}
|
||||
VALUE path = ruby_whisper_normalize_model_path(value);
|
||||
rwp->params.vad_model_path = StringValueCStr(path);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_get_vad_model_path(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
return rwp->params.vad_model_path == NULL ? Qnil : rb_str_new2(rwp->params.vad_model_path);
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* vad_params = params -> params
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_params_set_vad_params(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rwp->vad_params = value;
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_params_get_vad_params(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
return rwp->vad_params;
|
||||
}
|
||||
|
||||
#define SET_PARAM_IF_SAME(param_name) \
|
||||
if (id == id_ ## param_name) { \
|
||||
ruby_whisper_params_set_ ## param_name(self, value); \
|
||||
@ -1101,6 +905,7 @@ ruby_whisper_params_get_vad_params(VALUE self)
|
||||
static VALUE
|
||||
ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
{
|
||||
|
||||
VALUE kw_hash;
|
||||
VALUE values[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT] = {Qundef};
|
||||
VALUE value;
|
||||
@ -1113,8 +918,8 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
return self;
|
||||
}
|
||||
|
||||
rb_get_kwargs(kw_hash, param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, values);
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rb_get_kwargs(kw_hash, ¶m_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, &values);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
|
||||
for (i = 0; i < RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT; i++) {
|
||||
id = param_names[i];
|
||||
@ -1153,13 +958,8 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
SET_PARAM_IF_SAME(new_segment_callback_user_data)
|
||||
SET_PARAM_IF_SAME(progress_callback)
|
||||
SET_PARAM_IF_SAME(progress_callback_user_data)
|
||||
SET_PARAM_IF_SAME(encoder_begin_callback)
|
||||
SET_PARAM_IF_SAME(encoder_begin_callback_user_data)
|
||||
SET_PARAM_IF_SAME(abort_callback)
|
||||
SET_PARAM_IF_SAME(abort_callback_user_data)
|
||||
SET_PARAM_IF_SAME(vad)
|
||||
SET_PARAM_IF_SAME(vad_model_path)
|
||||
SET_PARAM_IF_SAME(vad_params)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1181,10 +981,10 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
static VALUE
|
||||
ruby_whisper_params_on_new_segment(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
ruby_whisper_params *rws;
|
||||
Data_Get_Struct(self, ruby_whisper_params, rws);
|
||||
const VALUE blk = rb_block_proc();
|
||||
rb_ary_push(rwp->new_segment_callback_container->callbacks, blk);
|
||||
rb_ary_push(rws->new_segment_callback_container->callbacks, blk);
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
@ -1201,30 +1001,10 @@ ruby_whisper_params_on_new_segment(VALUE self)
|
||||
static VALUE
|
||||
ruby_whisper_params_on_progress(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
ruby_whisper_params *rws;
|
||||
Data_Get_Struct(self, ruby_whisper_params, rws);
|
||||
const VALUE blk = rb_block_proc();
|
||||
rb_ary_push(rwp->progress_callback_container->callbacks, blk);
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
/*
|
||||
* Hook called when the encoder starts.
|
||||
*
|
||||
* whisper.on_encoder_begin do
|
||||
* # ...
|
||||
* end
|
||||
*
|
||||
* call-seq:
|
||||
* on_encoder_begin { ... }
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_params_on_encoder_begin(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
const VALUE blk = rb_block_proc();
|
||||
rb_ary_push(rwp->encoder_begin_callback_container->callbacks, blk);
|
||||
rb_ary_push(rws->progress_callback_container->callbacks, blk);
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
@ -1245,10 +1025,10 @@ ruby_whisper_params_on_encoder_begin(VALUE self)
|
||||
static VALUE
|
||||
ruby_whisper_params_abort_on(VALUE self)
|
||||
{
|
||||
ruby_whisper_params *rwp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
ruby_whisper_params *rws;
|
||||
Data_Get_Struct(self, ruby_whisper_params, rws);
|
||||
const VALUE blk = rb_block_proc();
|
||||
rb_ary_push(rwp->abort_callback_container->callbacks, blk);
|
||||
rb_ary_push(rws->abort_callback_container->callbacks, blk);
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
@ -1288,16 +1068,10 @@ init_ruby_whisper_params(VALUE *mWhisper)
|
||||
DEFINE_PARAM(new_segment_callback_user_data, 25)
|
||||
DEFINE_PARAM(progress_callback, 26)
|
||||
DEFINE_PARAM(progress_callback_user_data, 27)
|
||||
DEFINE_PARAM(encoder_begin_callback, 28)
|
||||
DEFINE_PARAM(encoder_begin_callback_user_data, 29)
|
||||
DEFINE_PARAM(abort_callback, 30)
|
||||
DEFINE_PARAM(abort_callback_user_data, 31)
|
||||
DEFINE_PARAM(vad, 32)
|
||||
DEFINE_PARAM(vad_model_path, 33)
|
||||
DEFINE_PARAM(vad_params, 34)
|
||||
DEFINE_PARAM(abort_callback, 28)
|
||||
DEFINE_PARAM(abort_callback_user_data, 29)
|
||||
|
||||
rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
|
||||
rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);
|
||||
rb_define_method(cParams, "on_encoder_begin", ruby_whisper_params_on_encoder_begin, 0);
|
||||
rb_define_method(cParams, "abort_on", ruby_whisper_params_abort_on, 0);
|
||||
}
|
||||
|
@ -1,57 +1,28 @@
|
||||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
#define N_KEY_NAMES 5
|
||||
|
||||
static VALUE sym_start_time;
|
||||
static VALUE sym_end_time;
|
||||
static VALUE sym_text;
|
||||
static VALUE sym_no_speech_prob;
|
||||
static VALUE sym_speaker_turn_next;
|
||||
static VALUE key_names;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
|
||||
extern VALUE cSegment;
|
||||
|
||||
static void
|
||||
rb_whisper_segment_mark(void *p)
|
||||
rb_whisper_segment_mark(ruby_whisper_segment *rws)
|
||||
{
|
||||
ruby_whisper_segment *rws = (ruby_whisper_segment *)p;
|
||||
rb_gc_mark(rws->context);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_segment_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_segment *rws = (const ruby_whisper_segment *)p;
|
||||
size_t size = sizeof(rws);
|
||||
if (!rws) {
|
||||
return 0;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static const rb_data_type_t ruby_whisper_segment_type = {
|
||||
"ruby_whisper_segment",
|
||||
{rb_whisper_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_segment_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
VALUE
|
||||
ruby_whisper_segment_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
return TypedData_Make_Struct(klass, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
rws = ALLOC(ruby_whisper_segment);
|
||||
return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_whisper_segment_s_new(VALUE context, int index)
|
||||
rb_whisper_segment_initialize(VALUE context, int index)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
const VALUE segment = ruby_whisper_segment_allocate(cSegment);
|
||||
TypedData_Get_Struct(segment, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(segment, ruby_whisper_segment, rws);
|
||||
rws->context = context;
|
||||
rws->index = index;
|
||||
return segment;
|
||||
@ -67,12 +38,12 @@ static VALUE
|
||||
ruby_whisper_segment_get_start_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
|
||||
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
|
||||
return LONG2NUM(t0 * 10);
|
||||
return INT2NUM(t0 * 10);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -85,12 +56,12 @@ static VALUE
|
||||
ruby_whisper_segment_get_end_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
|
||||
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
|
||||
return LONG2NUM(t1 * 10);
|
||||
return INT2NUM(t1 * 10);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -103,9 +74,9 @@ static VALUE
|
||||
ruby_whisper_segment_get_speaker_turn_next(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
|
||||
}
|
||||
|
||||
@ -117,9 +88,9 @@ static VALUE
|
||||
ruby_whisper_segment_get_text(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
const char * text = whisper_full_get_segment_text(rw->context, rws->index);
|
||||
return rb_str_new2(text);
|
||||
}
|
||||
@ -132,89 +103,21 @@ static VALUE
|
||||
ruby_whisper_segment_get_no_speech_prob(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* deconstruct_keys(keys) -> hash
|
||||
*
|
||||
* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
||||
*
|
||||
* whisper.each_segment do |segment|
|
||||
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
*
|
||||
* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
||||
* end
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
|
||||
VALUE hash = rb_hash_new();
|
||||
long n_keys;
|
||||
if (NIL_P(keys)) {
|
||||
keys = key_names;
|
||||
n_keys = N_KEY_NAMES;
|
||||
} else {
|
||||
n_keys = RARRAY_LEN(keys);
|
||||
if (n_keys > N_KEY_NAMES) {
|
||||
return hash;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < n_keys; i++) {
|
||||
VALUE key = rb_ary_entry(keys, i);
|
||||
if (key == sym_start_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
|
||||
}
|
||||
if (key == sym_end_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
|
||||
}
|
||||
if (key == sym_text) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
|
||||
}
|
||||
if (key == sym_no_speech_prob) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
|
||||
}
|
||||
if (key == sym_speaker_turn_next) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
|
||||
}
|
||||
}
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
void
|
||||
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
|
||||
{
|
||||
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
|
||||
|
||||
sym_start_time = ID2SYM(rb_intern("start_time"));
|
||||
sym_end_time = ID2SYM(rb_intern("end_time"));
|
||||
sym_text = ID2SYM(rb_intern("text"));
|
||||
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
|
||||
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
|
||||
key_names = rb_ary_new3(
|
||||
N_KEY_NAMES,
|
||||
sym_start_time,
|
||||
sym_end_time,
|
||||
sym_text,
|
||||
sym_no_speech_prob,
|
||||
sym_speaker_turn_next
|
||||
);
|
||||
|
||||
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
||||
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
||||
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
||||
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
||||
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
||||
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
||||
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
|
||||
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
#include "common-whisper.h"
|
||||
#define DR_WAV_IMPLEMENTATION
|
||||
#include "dr_wav.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@ -8,15 +9,11 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
extern const rb_data_type_t ruby_whisper_params_type;
|
||||
|
||||
extern ID id_to_s;
|
||||
extern ID id_call;
|
||||
extern ID transcribe_option_names[1];
|
||||
|
||||
extern void
|
||||
prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
|
||||
register_callbacks(ruby_whisper_params * rwp, VALUE * self);
|
||||
|
||||
/*
|
||||
* transcribe a single file
|
||||
@ -35,16 +32,11 @@ VALUE
|
||||
ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
VALUE wave_file_path, blk, params, kws;
|
||||
VALUE opts[1];
|
||||
VALUE wave_file_path, blk, params;
|
||||
|
||||
rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, ¶ms, &kws, &blk);
|
||||
rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
|
||||
|
||||
int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rb_scan_args(argc, argv, "02&", &wave_file_path, ¶ms, &blk);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
||||
|
||||
if (!rb_respond_to(wave_file_path, id_to_s)) {
|
||||
rb_raise(rb_eRuntimeError, "Expected file path to wave file");
|
||||
@ -55,37 +47,111 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
if (!read_audio_data(fname_inp, pcmf32, pcmf32s, rwp->diarize)) {
|
||||
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
||||
return self;
|
||||
// WAV input - this is directly from main.cpp example
|
||||
{
|
||||
drwav wav;
|
||||
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
||||
|
||||
if (fname_inp == "-") {
|
||||
{
|
||||
uint8_t buf[1024];
|
||||
while (true) {
|
||||
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
||||
if (n == 0) {
|
||||
break;
|
||||
}
|
||||
wav_data.insert(wav_data.end(), buf, buf + n);
|
||||
}
|
||||
}
|
||||
|
||||
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
|
||||
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
||||
return self;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
||||
} else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
|
||||
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
||||
return self;
|
||||
}
|
||||
|
||||
if (wav.channels != 1 && wav.channels != 2) {
|
||||
fprintf(stderr, "WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
|
||||
return self;
|
||||
}
|
||||
|
||||
if (rwp->diarize && wav.channels != 2 && rwp->params.print_timestamps == false) {
|
||||
fprintf(stderr, "WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
|
||||
return self;
|
||||
}
|
||||
|
||||
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
||||
fprintf(stderr, "WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
|
||||
return self;
|
||||
}
|
||||
|
||||
if (wav.bitsPerSample != 16) {
|
||||
fprintf(stderr, "WAV file '%s' must be 16-bit\n", fname_inp.c_str());
|
||||
return self;
|
||||
}
|
||||
|
||||
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
|
||||
|
||||
std::vector<int16_t> pcm16;
|
||||
pcm16.resize(n*wav.channels);
|
||||
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
||||
drwav_uninit(&wav);
|
||||
|
||||
// convert to mono, float
|
||||
pcmf32.resize(n);
|
||||
if (wav.channels == 1) {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32[i] = float(pcm16[i])/32768.0f;
|
||||
}
|
||||
} else {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32[i] = float((int32_t)pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
||||
}
|
||||
}
|
||||
|
||||
if (rwp->diarize) {
|
||||
// convert to stereo, float
|
||||
pcmf32s.resize(2);
|
||||
|
||||
pcmf32s[0].resize(n);
|
||||
pcmf32s[1].resize(n);
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
|
||||
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Commented out because it is work in progress
|
||||
// {
|
||||
// static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
{
|
||||
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
|
||||
// rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
// bool is_aborted = *(bool*)user_data;
|
||||
// return !is_aborted;
|
||||
// };
|
||||
// rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
||||
// }
|
||||
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
|
||||
prepare_transcription(rwp, &self);
|
||||
register_callbacks(rwp, &self);
|
||||
|
||||
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
|
||||
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return self;
|
||||
}
|
||||
if (NIL_P(blk)) {
|
||||
return self;
|
||||
}
|
||||
const int n_segments = whisper_full_n_segments(rw->context);
|
||||
VALUE output = rb_str_new2("");
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(rw->context, i);
|
||||
output = rb_str_concat(output, rb_str_new2(text));
|
||||
}
|
||||
rb_funcall(blk, id_call, 1, output);
|
||||
VALUE idCall = id_call;
|
||||
if (blk != Qnil) {
|
||||
rb_funcall(blk, idCall, 1, output);
|
||||
}
|
||||
return self;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
|
@ -1,288 +0,0 @@
|
||||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
#define DEFINE_PARAM(param_name, nth) \
|
||||
id_ ## param_name = rb_intern(#param_name); \
|
||||
param_names[nth] = id_ ## param_name; \
|
||||
rb_define_method(cVADParams, #param_name, ruby_whisper_vad_params_get_ ## param_name, 0); \
|
||||
rb_define_method(cVADParams, #param_name "=", ruby_whisper_vad_params_set_ ## param_name, 1);
|
||||
|
||||
#define NUM_PARAMS 6
|
||||
|
||||
extern VALUE cVADParams;
|
||||
|
||||
static size_t
|
||||
ruby_whisper_vad_params_memsize(const void *p)
|
||||
{
|
||||
const struct ruby_whisper_vad_params *params = p;
|
||||
size_t size = sizeof(params);
|
||||
if (!params) {
|
||||
return 0;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static ID param_names[NUM_PARAMS];
|
||||
static ID id_threshold;
|
||||
static ID id_min_speech_duration_ms;
|
||||
static ID id_min_silence_duration_ms;
|
||||
static ID id_max_speech_duration_s;
|
||||
static ID id_speech_pad_ms;
|
||||
static ID id_samples_overlap;
|
||||
|
||||
const rb_data_type_t ruby_whisper_vad_params_type = {
|
||||
"ruby_whisper_vad_params",
|
||||
{0, 0, ruby_whisper_vad_params_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_s_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params = whisper_vad_default_params();
|
||||
return obj;
|
||||
}
|
||||
|
||||
/*
|
||||
* Probability threshold to consider as speech.
|
||||
*
|
||||
* call-seq:
|
||||
* threshold = th -> th
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_threshold(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.threshold = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_threshold(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return DBL2NUM(rwvp->params.threshold);
|
||||
}
|
||||
|
||||
/*
|
||||
* Min duration for a valid speech segment.
|
||||
*
|
||||
* call-seq:
|
||||
* min_speech_duration_ms = duration_ms -> duration_ms
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_min_speech_duration_ms(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.min_speech_duration_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_min_speech_duration_ms(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return INT2NUM(rwvp->params.min_speech_duration_ms);
|
||||
}
|
||||
|
||||
/*
|
||||
* Min silence duration to consider speech as ended.
|
||||
*
|
||||
* call-seq:
|
||||
* min_silence_duration_ms = duration_ms -> duration_ms
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_min_silence_duration_ms(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.min_silence_duration_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_min_silence_duration_ms(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return INT2NUM(rwvp->params.min_silence_duration_ms);
|
||||
}
|
||||
|
||||
/*
|
||||
* Max duration of a speech segment before forcing a new segment.
|
||||
*
|
||||
* call-seq:
|
||||
* max_speech_duration_s = duration_s -> duration_s
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_max_speech_duration_s(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.max_speech_duration_s = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_max_speech_duration_s(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return DBL2NUM(rwvp->params.max_speech_duration_s);
|
||||
}
|
||||
|
||||
/*
|
||||
* Padding added before and after speech segments.
|
||||
*
|
||||
* call-seq:
|
||||
* speech_pad_ms = pad_ms -> pad_ms
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_speech_pad_ms(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.speech_pad_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_speech_pad_ms(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return INT2NUM(rwvp->params.speech_pad_ms);
|
||||
}
|
||||
|
||||
/*
|
||||
* Overlap in seconds when copying audio samples from speech segment.
|
||||
*
|
||||
* call-seq:
|
||||
* samples_overlap = overlap -> overlap
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_samples_overlap(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.samples_overlap = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_samples_overlap(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return DBL2NUM(rwvp->params.samples_overlap);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_equal(VALUE self, VALUE other)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp1;
|
||||
ruby_whisper_vad_params *rwvp2;
|
||||
|
||||
if (self == other) {
|
||||
return Qtrue;
|
||||
}
|
||||
|
||||
if (!rb_obj_is_kind_of(other, cVADParams)) {
|
||||
return Qfalse;
|
||||
}
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp1);
|
||||
TypedData_Get_Struct(other, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp2);
|
||||
|
||||
if (rwvp1->params.threshold != rwvp2->params.threshold) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.min_speech_duration_ms != rwvp2->params.min_speech_duration_ms) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.min_silence_duration_ms != rwvp2->params.min_silence_duration_ms) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.max_speech_duration_s != rwvp2->params.max_speech_duration_s) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.speech_pad_ms != rwvp2->params.speech_pad_ms) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.samples_overlap != rwvp2->params.samples_overlap) {
|
||||
return Qfalse;
|
||||
}
|
||||
|
||||
return Qtrue;
|
||||
}
|
||||
|
||||
#define SET_PARAM_IF_SAME(param_name) \
|
||||
if (id == id_ ## param_name) { \
|
||||
ruby_whisper_vad_params_set_ ## param_name(self, value); \
|
||||
continue; \
|
||||
}
|
||||
|
||||
VALUE
|
||||
ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
{
|
||||
VALUE kw_hash;
|
||||
VALUE values[NUM_PARAMS] = {Qundef};
|
||||
VALUE value;
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
ID id;
|
||||
int i;
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
|
||||
rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
|
||||
if (NIL_P(kw_hash)) {
|
||||
return self;
|
||||
}
|
||||
|
||||
rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
|
||||
|
||||
for (i = 0; i < NUM_PARAMS; i++) {
|
||||
id = param_names[i];
|
||||
value = values[i];
|
||||
if (value == Qundef) {
|
||||
continue;
|
||||
}
|
||||
SET_PARAM_IF_SAME(threshold)
|
||||
SET_PARAM_IF_SAME(min_speech_duration_ms)
|
||||
SET_PARAM_IF_SAME(min_silence_duration_ms)
|
||||
SET_PARAM_IF_SAME(max_speech_duration_s)
|
||||
SET_PARAM_IF_SAME(speech_pad_ms)
|
||||
SET_PARAM_IF_SAME(samples_overlap)
|
||||
}
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
#undef SET_PARAM_IF_SAME
|
||||
|
||||
void
|
||||
init_ruby_whisper_vad_params(VALUE *mVAD)
|
||||
{
|
||||
cVADParams = rb_define_class_under(*mVAD, "Params", rb_cObject);
|
||||
rb_define_alloc_func(cVADParams, ruby_whisper_vad_params_s_allocate);
|
||||
rb_define_method(cVADParams, "initialize", ruby_whisper_vad_params_initialize, -1);
|
||||
|
||||
DEFINE_PARAM(threshold, 0)
|
||||
DEFINE_PARAM(min_speech_duration_ms, 1)
|
||||
DEFINE_PARAM(min_silence_duration_ms, 2)
|
||||
DEFINE_PARAM(max_speech_duration_s, 3)
|
||||
DEFINE_PARAM(speech_pad_ms, 4)
|
||||
DEFINE_PARAM(samples_overlap, 5)
|
||||
|
||||
rb_define_method(cVADParams, "==", ruby_whisper_vad_params_equal, 1);
|
||||
}
|
||||
|
||||
#undef DEFINE_PARAM
|
||||
#undef NUM_PARAMS
|
@ -1,8 +0,0 @@
|
||||
set(GRAPHVIZ_EXECUTABLES FALSE)
|
||||
set(GRAPHVIZ_STATIC_LIBS TRUE)
|
||||
set(GRAPHVIZ_SHARED_LIBS FALSE)
|
||||
set(GRAPHVIZ_MODULE_LIBS FALSE)
|
||||
set(GRAPHVIZ_INTERFACE_LIBS FALSE)
|
||||
set(GRAPHVIZ_OBJECT_LIBS FALSE)
|
||||
set(GRAPHVIZ_UNKNOWN_LIBS FALSE)
|
||||
set(GRAPHVIZ_GENERATE_DEPENDERS FALSE)
|
@ -1,40 +1,6 @@
|
||||
require "pathname"
|
||||
require "yaml"
|
||||
|
||||
root = Pathname("..")/".."
|
||||
ignored_dirs = %w[
|
||||
.devops
|
||||
.github
|
||||
ci
|
||||
examples/wchess/wchess.wasm
|
||||
examples/whisper.android
|
||||
examples/whisper.android.java
|
||||
examples/whisper.objc
|
||||
examples/whisper.swiftui
|
||||
grammars
|
||||
models
|
||||
samples
|
||||
scripts
|
||||
].collect {|dir| root/dir}
|
||||
ignored_files = %w[
|
||||
AUTHORS
|
||||
Makefile
|
||||
README.md
|
||||
README_sycl.md
|
||||
.gitignore
|
||||
.gitmodules
|
||||
.dockerignore
|
||||
whisper.nvim
|
||||
twitch.sh
|
||||
yt-wsp.sh
|
||||
close-issue.yml
|
||||
]
|
||||
|
||||
EXTSOURCES =
|
||||
`git ls-files -z #{root}`.split("\x0")
|
||||
.collect {|file| Pathname(file)}
|
||||
.reject {|file|
|
||||
ignored_dirs.any? {|dir| file.descend.any? {|desc| desc == dir}} ||
|
||||
ignored_files.include?(file.basename.to_path) ||
|
||||
(file.descend.to_a[1] != root && file.descend.to_a[1] != Pathname("..")/"javascript")
|
||||
}
|
||||
.collect(&:to_path)
|
||||
sources = `git ls-files -z ../..`.split("\x0")
|
||||
paths = YAML.load_file("../../.github/workflows/bindings-ruby.yml")[true]["push"]["paths"]
|
||||
paths.delete "bindings/ruby/**"
|
||||
EXTSOURCES = (Dir.glob(paths, base: "../..").collect {|path| "../../#{path}"} << "../../LICENSE") & sources
|
||||
|
@ -1,15 +0,0 @@
|
||||
module Whisper
|
||||
class Context
|
||||
def to_srt
|
||||
each_segment.with_index.reduce("") {|srt, (segment, index)|
|
||||
srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
|
||||
}
|
||||
end
|
||||
|
||||
def to_webvtt
|
||||
each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
|
||||
webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
@ -34,7 +34,7 @@ module Whisper
|
||||
when /darwin/
|
||||
Pathname(Dir.home)/"Library/Caches"
|
||||
else
|
||||
ENV.key?("XDG_CACHE_HOME") ? Pathname(ENV["XDG_CACHE_HOME"]) : Pathname(Dir.home)/".cache"
|
||||
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
|
||||
end
|
||||
base/"whisper.cpp"
|
||||
end
|
||||
@ -53,10 +53,8 @@ module Whisper
|
||||
http.request request do |response|
|
||||
case response
|
||||
when Net::HTTPNotModified
|
||||
# noop
|
||||
# noop
|
||||
when Net::HTTPOK
|
||||
return if !response.key?("last-modified") && cache_path.exist?
|
||||
|
||||
download response
|
||||
when Net::HTTPRedirection
|
||||
request URI(response["location"]), headers
|
||||
@ -70,7 +68,7 @@ module Whisper
|
||||
rescue => err
|
||||
if cache_path.exist?
|
||||
warn err
|
||||
# Use cache file
|
||||
# Use cache file
|
||||
else
|
||||
raise
|
||||
end
|
||||
@ -130,44 +128,6 @@ module Whisper
|
||||
end
|
||||
end
|
||||
|
||||
class ZipURI < URI
|
||||
def cache
|
||||
zip_path = super
|
||||
dest = unzipped_path
|
||||
return if dest.exist? && dest.mtime >= zip_path.mtime
|
||||
escaping dest do
|
||||
system "unzip", "-q", "-d", zip_path.dirname.to_path, zip_path.to_path, exception: true
|
||||
end
|
||||
zip_path
|
||||
end
|
||||
|
||||
def clear_cache
|
||||
super
|
||||
unzipped_path.rmtree if unzipped_path.exist?
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def unzipped_path
|
||||
cache_path.sub_ext("")
|
||||
end
|
||||
|
||||
def escaping(path)
|
||||
escaped = Pathname("#{path}.removing")
|
||||
if path.exist?
|
||||
escaped.rmtree if escaped.exist?
|
||||
path.rename escaped
|
||||
end
|
||||
yield
|
||||
ensure
|
||||
if path.exist?
|
||||
escaped.rmtree if escaped.exist?
|
||||
else
|
||||
escaped.rename path if escaped.exist?
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@pre_converted_models = %w[
|
||||
tiny
|
||||
tiny.en
|
||||
@ -203,31 +163,8 @@ module Whisper
|
||||
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
|
||||
}
|
||||
|
||||
%w[
|
||||
silero-v5.1.2
|
||||
].each do |name|
|
||||
@pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
|
||||
end
|
||||
|
||||
@coreml_compiled_models = %w[
|
||||
tiny
|
||||
tiny.en
|
||||
base
|
||||
base.en
|
||||
small
|
||||
small.en
|
||||
medium
|
||||
medium.en
|
||||
large-v1
|
||||
large-v2
|
||||
large-v3
|
||||
large-v3-turbo
|
||||
].each_with_object({}) do |name, models|
|
||||
models[@pre_converted_models[name]] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
|
||||
end
|
||||
|
||||
class << self
|
||||
attr_reader :pre_converted_models, :coreml_compiled_models
|
||||
attr_reader :pre_converted_models
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -1,58 +0,0 @@
|
||||
module Whisper
|
||||
class Segment
|
||||
SRT_ESCAPES = {
|
||||
"&" => "&",
|
||||
"<" => "<",
|
||||
">" => ">",
|
||||
}
|
||||
SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
|
||||
private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
|
||||
|
||||
def to_srt_cue
|
||||
"#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
|
||||
end
|
||||
|
||||
def to_webvtt_cue
|
||||
"#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def time_to_a(time)
|
||||
sec, decimal_part = time.divmod(1000)
|
||||
min, sec = sec.divmod(60)
|
||||
hour, min = min.divmod(60)
|
||||
[hour, min, sec, decimal_part]
|
||||
end
|
||||
|
||||
def srt_time(time)
|
||||
"%02d:%02d:%02d,%03d" % time_to_a(time)
|
||||
end
|
||||
|
||||
def srt_start_time
|
||||
srt_time(start_time)
|
||||
end
|
||||
|
||||
def srt_end_time
|
||||
srt_time(end_time)
|
||||
end
|
||||
|
||||
def srt_text
|
||||
text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
|
||||
end
|
||||
|
||||
def webvtt_time(time)
|
||||
"%02d:%02d:%02d.%03d" % time_to_a(time)
|
||||
end
|
||||
|
||||
def webvtt_start_time
|
||||
webvtt_time(start_time)
|
||||
end
|
||||
|
||||
def webvtt_end_time
|
||||
webvtt_time(end_time)
|
||||
end
|
||||
|
||||
alias webvtt_text srt_text
|
||||
end
|
||||
end
|
@ -7,7 +7,6 @@ module Whisper
|
||||
type log_callback = ^(Integer level, String message, Object user_data) -> void
|
||||
type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
|
||||
type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
|
||||
type encoder_begin_callback = ^(Whisper::Context, void, Object user_data) -> void
|
||||
type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish
|
||||
|
||||
LOG_LEVEL_NONE: Integer
|
||||
@ -22,23 +21,11 @@ module Whisper
|
||||
def self.lang_str: (Integer id) -> String
|
||||
def self.lang_str_full: (Integer id) -> String
|
||||
def self.log_set: (log_callback, Object? user_data) -> log_callback
|
||||
def self.system_info_str: () -> String
|
||||
|
||||
class Context
|
||||
def self.new: (String | path | ::URI::HTTP) -> instance
|
||||
|
||||
# transcribe a single file
|
||||
# can emit to a block results
|
||||
#
|
||||
# params = Whisper::Params.new
|
||||
# params.duration = 60_000
|
||||
# whisper.transcribe "path/to/audio.wav", params do |text|
|
||||
# puts text
|
||||
# end
|
||||
#
|
||||
def transcribe: (string, Params, ?n_processors: Integer) -> self
|
||||
| (string, Params, ?n_processors: Integer) { (String) -> void } -> self
|
||||
|
||||
def self.new: (string | _ToPath | ::URI::HTTP) -> instance
|
||||
def transcribe: (string, Params) -> self
|
||||
| (string, Params) { (String) -> void } -> self
|
||||
def model_n_vocab: () -> Integer
|
||||
def model_n_audio_ctx: () -> Integer
|
||||
def model_n_audio_state: () -> Integer
|
||||
@ -47,78 +34,22 @@ module Whisper
|
||||
def model_n_mels: () -> Integer
|
||||
def model_ftype: () -> Integer
|
||||
def model_type: () -> String
|
||||
|
||||
# Yields each Whisper::Segment:
|
||||
#
|
||||
# whisper.transcribe("path/to/audio.wav", params)
|
||||
# whisper.each_segment do |segment|
|
||||
# puts segment.text
|
||||
# end
|
||||
#
|
||||
# Returns an Enumerator if no block given:
|
||||
#
|
||||
# whisper.transcribe("path/to/audio.wav", params)
|
||||
# enum = whisper.each_segment
|
||||
# enum.to_a # => [#<Whisper::Segment>, ...]
|
||||
#
|
||||
def each_segment: { (Segment) -> void } -> void
|
||||
| () -> Enumerator[Segment]
|
||||
|
||||
def model: () -> Model
|
||||
def full_get_segment: (Integer nth) -> Segment
|
||||
def full_n_segments: () -> Integer
|
||||
|
||||
# Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
|
||||
#
|
||||
def full_lang_id: () -> Integer
|
||||
|
||||
# Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
|
||||
#
|
||||
# full_get_segment_t0(3) # => 1668 (16680 ms)
|
||||
#
|
||||
def full_get_segment_t0: (Integer) -> Integer
|
||||
|
||||
# End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
|
||||
#
|
||||
# full_get_segment_t1(3) # => 1668 (16680 ms)
|
||||
#
|
||||
def full_get_segment_t1: (Integer) -> Integer
|
||||
|
||||
# Whether the next segment indexed by +segment_index+ is predicated as a speaker turn.
|
||||
#
|
||||
# full_get_segment_speacker_turn_next(3) # => true
|
||||
#
|
||||
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
|
||||
|
||||
# Text of a segment indexed by +segment_index+.
|
||||
#
|
||||
# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
|
||||
#
|
||||
def full_get_segment_text: (Integer) -> String
|
||||
|
||||
def full_get_segment_no_speech_prob: (Integer) -> Float
|
||||
|
||||
# Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
# Not thread safe for same context
|
||||
# Uses the specified decoding strategy to obtain the text.
|
||||
#
|
||||
# The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
|
||||
#
|
||||
def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer n_samples) -> self
|
||||
|
||||
# Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
||||
# Result is stored in the default state of the context
|
||||
# Not thread safe if executed in parallel on the same context.
|
||||
# It seems this approach can offer some speedup in some cases.
|
||||
# However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
#
|
||||
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
||||
|
||||
def to_srt: () -> String
|
||||
def to_webvtt: () -> String
|
||||
end
|
||||
|
||||
class Params
|
||||
@ -151,246 +82,76 @@ module Whisper
|
||||
?new_segment_callback_user_data: Object,
|
||||
?progress_callback: progress_callback,
|
||||
?progress_callback_user_data: Object,
|
||||
?encoder_begin_callback: encoder_begin_callback,
|
||||
?encoder_begin_callback_user_data: Object,
|
||||
?abort_callback: abort_callback,
|
||||
?abort_callback_user_data: Object,
|
||||
?vad: boolish,
|
||||
?vad_model_path: path | URI,
|
||||
?vad_params: Whisper::VAD::Params
|
||||
?abort_callback_user_data: Object
|
||||
) -> instance
|
||||
|
||||
# params.language = "auto" | "en", etc...
|
||||
#
|
||||
def language=: (String) -> String # TODO: Enumerate lang names
|
||||
|
||||
def language: () -> String
|
||||
def translate=: (boolish) -> boolish
|
||||
def translate: () -> (true | false)
|
||||
def no_context=: (boolish) -> boolish
|
||||
|
||||
# If true, does not use past transcription (if any) as initial prompt for the decoder.
|
||||
#
|
||||
def no_context: () -> (true | false)
|
||||
|
||||
def single_segment=: (boolish) -> boolish
|
||||
|
||||
# If true, forces single segment output (useful for streaming).
|
||||
#
|
||||
def single_segment: () -> (true | false)
|
||||
|
||||
def print_special=: (boolish) -> boolish
|
||||
|
||||
# If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
|
||||
#
|
||||
def print_special: () -> (true | false)
|
||||
|
||||
def print_progress=: (boolish) -> boolish
|
||||
|
||||
# If true, prints progress information.
|
||||
#
|
||||
def print_progress: () -> (true | false)
|
||||
|
||||
def print_realtime=: (boolish) -> boolish
|
||||
|
||||
# If true, prints results from within whisper.cpp. (avoid it, use callback instead)
|
||||
#
|
||||
def print_realtime: () -> (true | false)
|
||||
|
||||
# If true, prints timestamps for each text segment when printing realtime.
|
||||
#
|
||||
def print_timestamps=: (boolish) -> boolish
|
||||
|
||||
def print_timestamps: () -> (true | false)
|
||||
|
||||
def suppress_blank=: (boolish) -> boolish
|
||||
|
||||
# If true, suppresses blank outputs.
|
||||
#
|
||||
def suppress_blank: () -> (true | false)
|
||||
|
||||
def suppress_nst=: (boolish) -> boolish
|
||||
|
||||
# If true, suppresses non-speech-tokens.
|
||||
#
|
||||
def suppress_nst: () -> (true | false)
|
||||
|
||||
def token_timestamps=: (boolish) -> boolish
|
||||
|
||||
# If true, enables token-level timestamps.
|
||||
#
|
||||
def token_timestamps: () -> (true | false)
|
||||
|
||||
def split_on_word=: (boolish) -> boolish
|
||||
|
||||
# If true, split on word rather than on token (when used with max_len).
|
||||
#
|
||||
def split_on_word: () -> (true | false)
|
||||
|
||||
def initial_prompt=: (_ToS) -> _ToS
|
||||
|
||||
# Tokens to provide to the whisper decoder as initial prompt
|
||||
# these are prepended to any existing text context from a previous call
|
||||
# use whisper_tokenize() to convert text to tokens.
|
||||
# Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
|
||||
#
|
||||
def initial_prompt: () -> (String | nil)
|
||||
|
||||
def diarize=: (boolish) -> boolish
|
||||
|
||||
# If true, enables diarization.
|
||||
#
|
||||
def diarize: () -> (true | false)
|
||||
|
||||
def offset=: (Integer) -> Integer
|
||||
|
||||
# Start offset in ms.
|
||||
#
|
||||
def offset: () -> Integer
|
||||
|
||||
def duration=: (Integer) -> Integer
|
||||
|
||||
# Audio duration to process in ms.
|
||||
#
|
||||
def duration: () -> Integer
|
||||
|
||||
def max_text_tokens=: (Integer) -> Integer
|
||||
|
||||
# Max tokens to use from past text as prompt for the decoder.
|
||||
#
|
||||
def max_text_tokens: () -> Integer
|
||||
|
||||
def temperature=: (Float) -> Float
|
||||
def temperature: () -> Float
|
||||
def max_initial_ts=: (Float) -> Float
|
||||
|
||||
# See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
||||
#
|
||||
def max_initial_ts: () -> Float
|
||||
|
||||
def length_penalty=: (Float) -> Float
|
||||
def length_penalty: () -> Float
|
||||
def temperature_inc=: (Float) -> Float
|
||||
def temperature_inc: () -> Float
|
||||
def entropy_thold=: (Float) -> Float
|
||||
|
||||
# Similar to OpenAI's "compression_ratio_threshold"
|
||||
#
|
||||
def entropy_thold: () -> Float
|
||||
|
||||
def logprob_thold=: (Float) -> Float
|
||||
def logprob_thold: () -> Float
|
||||
def no_speech_thold=: (Float) -> Float
|
||||
def no_speech_thold: () -> Float
|
||||
|
||||
# Sets new segment callback, called for every newly generated text segment.
|
||||
#
|
||||
# params.new_segment_callback = ->(context, _, n_new, user_data) {
|
||||
# # ...
|
||||
# }
|
||||
#
|
||||
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
|
||||
def new_segment_callback: () -> (new_segment_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of new segment callback.
|
||||
#
|
||||
def new_segment_callback_user_data=: (Object) -> Object
|
||||
|
||||
def new_segment_callback_user_data: () -> Object
|
||||
|
||||
# Sets progress callback, called on each progress update.
|
||||
#
|
||||
# params.new_segment_callback = ->(context, _, progress, user_data) {
|
||||
# # ...
|
||||
# }
|
||||
#
|
||||
# +progress+ is an Integer between 0 and 100.
|
||||
#
|
||||
def progress_callback=: (progress_callback) -> progress_callback
|
||||
|
||||
def progress_callback: () -> (progress_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of progress callback.
|
||||
#
|
||||
def progress_callback_user_data=: (Object) -> Object
|
||||
|
||||
def progress_callback_user_data: () -> Object
|
||||
|
||||
# Sets encoder begin callback, called when the encoder starts.
|
||||
#
|
||||
def encoder_begin_callback=: (encoder_begin_callback) -> encoder_begin_callback
|
||||
|
||||
def encoder_begin_callback: () -> (encoder_begin_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of encoder begin callback.
|
||||
#
|
||||
def encoder_begin_callback_user_data=: (Object) -> Object
|
||||
|
||||
def encoder_begin_callback_user_data: () -> Object
|
||||
|
||||
# Sets abort callback, called to check if the process should be aborted.
|
||||
#
|
||||
# params.abort_callback = ->(user_data) {
|
||||
# # ...
|
||||
# }
|
||||
#
|
||||
#
|
||||
def abort_callback=: (abort_callback) -> abort_callback
|
||||
|
||||
def abort_callback: () -> (abort_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of abort callback.
|
||||
#
|
||||
def abort_callback_user_data=: (Object) -> Object
|
||||
|
||||
def abort_callback_user_data: () -> Object
|
||||
|
||||
# Enable VAD
|
||||
#
|
||||
def vad=: (boolish) -> boolish
|
||||
|
||||
def vad: () -> (true | false)
|
||||
|
||||
# Path to the VAD model
|
||||
def vad_model_path=: (path | URI | nil) -> (path | URI | nil)
|
||||
|
||||
def vad_model_path: () -> (String | nil)
|
||||
|
||||
def vad_params=: (Whisper::VAD::Params) -> Whisper::VAD::Params
|
||||
def vad_params: () -> (Whisper::VAD::Params)
|
||||
|
||||
# Hook called on new segment. Yields each Whisper::Segment.
|
||||
#
|
||||
# whisper.on_new_segment do |segment|
|
||||
# # ...
|
||||
# end
|
||||
#
|
||||
def on_new_segment: { (Segment) -> void } -> void
|
||||
|
||||
# Hook called on progress update. Yields each progress Integer between 0 and 100.
|
||||
#
|
||||
def on_progress: { (Integer progress) -> void } -> void
|
||||
|
||||
# Hook called on encoder starts.
|
||||
#
|
||||
def on_encoder_begin: { () -> void } -> void
|
||||
|
||||
# Call block to determine whether abort or not. Return +true+ when you want to abort.
|
||||
#
|
||||
# params.abort_on do
|
||||
# if some_condition
|
||||
# true # abort
|
||||
# else
|
||||
# false # continue
|
||||
# end
|
||||
# end
|
||||
#
|
||||
def abort_on: { (Object user_data) -> boolish } -> void
|
||||
end
|
||||
|
||||
class Model
|
||||
def self.pre_converted_models: () -> Hash[String, Model::URI]
|
||||
def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI]
|
||||
def self.new: () -> instance
|
||||
def n_vocab: () -> Integer
|
||||
def n_audio_ctx: () -> Integer
|
||||
@ -406,99 +167,18 @@ module Whisper
|
||||
def type: () -> String
|
||||
|
||||
class URI
|
||||
def self.new: (string | ::URI::HTTP) -> instance
|
||||
def self.new: (string | ::URI::HTTP) -> self
|
||||
def to_path: -> String
|
||||
def clear_cache: -> void
|
||||
end
|
||||
|
||||
class ZipURI < URI
|
||||
def cache: () -> Pathname
|
||||
def clear_cache: () -> void
|
||||
end
|
||||
end
|
||||
|
||||
class Segment
|
||||
type deconstructed_keys = {
|
||||
start_time: (Integer | nil),
|
||||
end_time: (Integer | nil),
|
||||
text: (String | nil),
|
||||
no_speech_prob: (Float | nil),
|
||||
speaker_turn_next: (true | false | nil)
|
||||
}
|
||||
|
||||
# Start time in milliseconds.
|
||||
#
|
||||
def start_time: () -> Integer
|
||||
|
||||
# End time in milliseconds.
|
||||
#
|
||||
def end_time: () -> Integer
|
||||
|
||||
# Whether the next segment is predicted as a speaker turn.
|
||||
def speaker_turn_next?: () -> (true | false)
|
||||
|
||||
def speaker_next_turn?: () -> (true | false)
|
||||
def text: () -> String
|
||||
def no_speech_prob: () -> Float
|
||||
def to_srt_cue: () -> String
|
||||
def to_webvtt_cue: () -> String
|
||||
|
||||
# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
||||
#
|
||||
# whisper.each_segment do |segment|
|
||||
# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
#
|
||||
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
||||
# end
|
||||
def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
|
||||
end
|
||||
|
||||
module VAD
|
||||
class Params
|
||||
def self.new: (
|
||||
?threshold: Float,
|
||||
?min_speech_duration_ms: Integer,
|
||||
?min_silence_duration_ms: Integer,
|
||||
?max_speech_duration_s: Float,
|
||||
?speech_pad_ms: Integer,
|
||||
?samples_overlap: Float
|
||||
) -> instance
|
||||
|
||||
# Probability threshold to consider as speech.
|
||||
#
|
||||
def threshold=: (Float) -> Float
|
||||
|
||||
def threshold: () -> Float
|
||||
|
||||
# Min duration for a valid speech segment.
|
||||
#
|
||||
def min_speech_duration_ms=: (Integer) -> Integer
|
||||
|
||||
def min_speech_duration_ms: () -> Integer
|
||||
|
||||
# Min silence duration to consider speech as ended.
|
||||
#
|
||||
def min_silence_duration_ms=: (Integer) -> Integer
|
||||
|
||||
def min_silence_duration_ms: () -> Integer
|
||||
|
||||
# Max duration of a speech segment before forcing a new segment.
|
||||
def max_speech_duration_s=: (Float) -> Float
|
||||
|
||||
def max_speech_duration_s: () -> Float
|
||||
|
||||
# Padding added before and after speech segments.
|
||||
#
|
||||
def speech_pad_ms=: (Integer) -> Integer
|
||||
|
||||
def speech_pad_ms: () -> Integer
|
||||
|
||||
# Overlap in seconds when copying audio samples from speech segment.
|
||||
#
|
||||
def samples_overlap=: (Float) -> Float
|
||||
|
||||
def samples_overlap: () -> Float
|
||||
def ==: (Params) -> (true | false)
|
||||
end
|
||||
end
|
||||
|
||||
class Error < StandardError
|
||||
|
@ -1,50 +0,0 @@
|
||||
require_relative "helper"
|
||||
require 'tempfile'
|
||||
require 'tmpdir'
|
||||
require 'shellwords'
|
||||
|
||||
class TestPackage < TestBase
|
||||
def test_build
|
||||
Tempfile.create do |file|
|
||||
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
|
||||
assert file.size > 0
|
||||
assert_path_exist file.to_path
|
||||
end
|
||||
end
|
||||
|
||||
sub_test_case "Building binary on installation" do
|
||||
def setup
|
||||
system "rake", "build", exception: true
|
||||
end
|
||||
|
||||
def test_install
|
||||
gemspec = Gem::Specification.load("whispercpp.gemspec")
|
||||
Dir.mktmpdir do |dir|
|
||||
system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", exception: true
|
||||
assert_installed dir, gemspec.version
|
||||
end
|
||||
end
|
||||
|
||||
def test_install_with_coreml
|
||||
omit_unless RUBY_PLATFORM.match?(/darwin/) do
|
||||
gemspec = Gem::Specification.load("whispercpp.gemspec")
|
||||
Dir.mktmpdir do |dir|
|
||||
system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", "--", "--enable-whisper-coreml", exception: true
|
||||
assert_installed dir, gemspec.version
|
||||
assert_nothing_raised do
|
||||
libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib")
|
||||
system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def assert_installed(dir, version)
|
||||
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", "whisper.#{RbConfig::CONFIG["DLEXT"]}")
|
||||
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/LICENSE")
|
||||
assert_path_not_exist File.join(dir, "gems/whispercpp-#{version}/ext/build")
|
||||
end
|
||||
end
|
||||
end
|
@ -1,136 +0,0 @@
|
||||
require_relative "helper"
|
||||
|
||||
class TestSegment < TestBase
|
||||
def test_iteration
|
||||
whisper.each_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
end
|
||||
end
|
||||
|
||||
def test_enumerator
|
||||
enum = whisper.each_segment
|
||||
assert_instance_of Enumerator, enum
|
||||
enum.to_a.each_with_index do |segment, index|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
assert_kind_of Integer, index
|
||||
end
|
||||
end
|
||||
|
||||
def test_start_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal 0, segment.start_time if i == 0
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_end_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_no_speech_prob
|
||||
no_speech_prob = nil
|
||||
whisper.each_segment do |segment|
|
||||
no_speech_prob = segment.no_speech_prob
|
||||
end
|
||||
assert no_speech_prob > 0.0
|
||||
end
|
||||
|
||||
def test_on_new_segment
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
index = 0
|
||||
params.on_new_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
if index == 0
|
||||
seg = segment
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
end
|
||||
index += 1
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
assert_equal 0, seg.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
|
||||
end
|
||||
|
||||
def test_on_new_segment_twice
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
params.on_new_segment do |segment|
|
||||
seg = segment
|
||||
return
|
||||
end
|
||||
params.on_new_segment do |segment|
|
||||
assert_same seg, segment
|
||||
return
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
end
|
||||
|
||||
def test_pattern_matching
|
||||
segment = whisper.each_segment.first
|
||||
segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
assert_equal segment.text, text
|
||||
assert_equal segment.no_speech_prob, no_speech_prob
|
||||
assert_equal segment.speaker_turn_next?, speaker_turn_next
|
||||
end
|
||||
|
||||
def test_pattern_matching_partial
|
||||
segment = whisper.each_segment.first
|
||||
segment => {start_time:, end_time:, text:}
|
||||
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
assert_equal segment.text, text
|
||||
end
|
||||
|
||||
def test_deconstruct_keys
|
||||
segment = whisper.each_segment.first
|
||||
expected = {
|
||||
start_time: segment.start_time,
|
||||
end_time: segment.end_time,
|
||||
text: segment.text,
|
||||
no_speech_prob: segment.no_speech_prob,
|
||||
speaker_turn_next: segment.speaker_turn_next?
|
||||
}
|
||||
assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_non_existent
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
assert_equal({}, segment.deconstruct_keys([:non_existent]))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_too_many_keys
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
expected = {
|
||||
start_time: segment.start_time,
|
||||
end_time: segment.end_time,
|
||||
text: segment.text,
|
||||
no_speech_prob: segment.no_speech_prob
|
||||
}
|
||||
assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
|
||||
end
|
||||
end
|
@ -1,19 +0,0 @@
|
||||
require_relative "helper"
|
||||
|
||||
class TestVAD < TestBase
|
||||
def setup
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
vad_params = Whisper::VAD::Params.new
|
||||
@params = Whisper::Params.new(
|
||||
vad: true,
|
||||
vad_model_path: "silero-v5.1.2",
|
||||
vad_params:
|
||||
)
|
||||
end
|
||||
|
||||
def test_transcribe
|
||||
@whisper.transcribe(TestBase::AUDIO, @params) do |text|
|
||||
assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
|
||||
end
|
||||
end
|
||||
end
|
@ -1,103 +0,0 @@
|
||||
require_relative "helper"
|
||||
|
||||
class TestVADParams < TestBase
|
||||
PARAM_NAMES = [
|
||||
:threshold,
|
||||
:min_speech_duration_ms,
|
||||
:min_silence_duration_ms,
|
||||
:max_speech_duration_s,
|
||||
:speech_pad_ms,
|
||||
:samples_overlap
|
||||
]
|
||||
|
||||
def setup
|
||||
@params = Whisper::VAD::Params.new
|
||||
end
|
||||
|
||||
def test_new
|
||||
params = Whisper::VAD::Params.new
|
||||
assert_kind_of Whisper::VAD::Params, params
|
||||
end
|
||||
|
||||
def test_threshold
|
||||
assert_in_delta @params.threshold, 0.5
|
||||
@params.threshold = 0.7
|
||||
assert_in_delta @params.threshold, 0.7
|
||||
end
|
||||
|
||||
def test_min_speech_duration
|
||||
pend
|
||||
end
|
||||
|
||||
def test_min_speech_duration_ms
|
||||
assert_equal 250, @params.min_speech_duration_ms
|
||||
@params.min_speech_duration_ms = 500
|
||||
assert_equal 500, @params.min_speech_duration_ms
|
||||
end
|
||||
|
||||
def test_min_silence_duration_ms
|
||||
assert_equal 100, @params.min_silence_duration_ms
|
||||
@params.min_silence_duration_ms = 200
|
||||
assert_equal 200, @params.min_silence_duration_ms
|
||||
end
|
||||
|
||||
def test_max_speech_duration
|
||||
pend
|
||||
end
|
||||
|
||||
def test_max_speech_duration_s
|
||||
assert @params.max_speech_duration_s >= 10e37 # Defaults to FLT_MAX
|
||||
@params.max_speech_duration_s = 60.0
|
||||
assert_equal 60.0, @params.max_speech_duration_s
|
||||
end
|
||||
|
||||
def test_speech_pad_ms
|
||||
assert_equal 30, @params.speech_pad_ms
|
||||
@params.speech_pad_ms = 50
|
||||
assert_equal 50, @params.speech_pad_ms
|
||||
end
|
||||
|
||||
def test_samples_overlap
|
||||
assert_in_delta @params.samples_overlap, 0.1
|
||||
@params.samples_overlap = 0.5
|
||||
assert_in_delta @params.samples_overlap, 0.5
|
||||
end
|
||||
|
||||
def test_equal
|
||||
assert_equal @params, Whisper::VAD::Params.new
|
||||
end
|
||||
|
||||
def test_new_with_kw_args
|
||||
params = Whisper::VAD::Params.new(threshold: 0.7)
|
||||
assert_in_delta params.threshold, 0.7
|
||||
assert_equal 250, params.min_speech_duration_ms
|
||||
end
|
||||
|
||||
def test_new_with_kw_args_non_existent
|
||||
assert_raise ArgumentError do
|
||||
Whisper::VAD::Params.new(non_existent: "value")
|
||||
end
|
||||
end
|
||||
|
||||
data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
|
||||
def test_new_with_kw_args_default_values(param)
|
||||
default_value = @params.send(param)
|
||||
value = default_value + 1
|
||||
params = Whisper::VAD::Params.new(param => value)
|
||||
if Float === value
|
||||
assert_in_delta value, params.send(param)
|
||||
else
|
||||
assert_equal value, params.send(param)
|
||||
end
|
||||
|
||||
PARAM_NAMES.reject {|name| name == param}.each do |name|
|
||||
expected = @params.send(name)
|
||||
actual = params.send(name)
|
||||
if Float === expected
|
||||
assert_in_delta expected, actual
|
||||
else
|
||||
assert_equal expected, actual
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
@ -3,12 +3,12 @@ require "whisper"
|
||||
require_relative "jfk_reader/jfk_reader"
|
||||
|
||||
class TestBase < Test::Unit::TestCase
|
||||
AUDIO = File.join(__dir__, "fixtures", "jfk.wav")
|
||||
AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
|
||||
|
||||
class << self
|
||||
def whisper
|
||||
return @whisper if @whisper
|
||||
attr_reader :whisper
|
||||
|
||||
def startup
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
params.print_timestamps = false
|
@ -25,7 +25,7 @@ class TestCallback < TestBase
|
||||
assert start_time >= 0
|
||||
assert_kind_of Integer, end_time
|
||||
assert end_time > 0
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text) if i_segment == 0
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
|
||||
end
|
||||
}
|
||||
|
||||
@ -111,48 +111,6 @@ class TestCallback < TestBase
|
||||
assert_equal 100, last
|
||||
end
|
||||
|
||||
def test_encoder_begin_callback
|
||||
i = 0
|
||||
@params.encoder_begin_callback = ->(context, state, user_data) {
|
||||
i += 1
|
||||
}
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert i > 0
|
||||
end
|
||||
|
||||
def test_encoder_begin_callback_abort
|
||||
logs = []
|
||||
Whisper.log_set -> (level, buffer, user_data) {
|
||||
logs << buffer if level == Whisper::LOG_LEVEL_ERROR
|
||||
}, logs
|
||||
@params.encoder_begin_callback = ->(context, state, user_data) {
|
||||
return false
|
||||
}
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert_match(/encoder_begin_callback returned false - aborting/, logs.join)
|
||||
Whisper.log_set ->(level, buffer, user_data) {}, nil
|
||||
end
|
||||
|
||||
def test_encoder_begin_callback_user_data
|
||||
udata = Object.new
|
||||
@params.encoder_begin_callback_user_data = udata
|
||||
yielded = nil
|
||||
@params.encoder_begin_callback = ->(context, state, user_data) {
|
||||
yielded = user_data
|
||||
}
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert_same udata, yielded
|
||||
end
|
||||
|
||||
def test_on_encoder_begin
|
||||
i = 0
|
||||
@params.on_encoder_begin do
|
||||
i += 1
|
||||
end
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert i > 0
|
||||
end
|
||||
|
||||
def test_abort_callback
|
||||
i = 0
|
||||
@params.abort_callback = ->(user_data) {
|
||||
@ -187,9 +145,9 @@ class TestCallback < TestBase
|
||||
|
||||
def test_abort_on
|
||||
do_abort = false
|
||||
_aborted_from_callback = false
|
||||
aborted_from_callback = false
|
||||
@params.on_new_segment do |segment|
|
||||
do_abort = true if segment.text.match?(/ask/)
|
||||
do_abort = true if segment.text.match? /ask/
|
||||
end
|
||||
i = 0
|
||||
@params.abort_on do
|
@ -4,7 +4,7 @@ class TestError < TestBase
|
||||
def test_error
|
||||
error = Whisper::Error.new(-2)
|
||||
assert_equal "failed to compute log mel spectrogram", error.message
|
||||
assert_equal(-2, error.code)
|
||||
assert_equal -2, error.code
|
||||
end
|
||||
|
||||
def test_unknown_error
|
||||
@ -14,7 +14,7 @@ class TestError < TestBase
|
||||
|
||||
def test_non_int_code
|
||||
assert_raise TypeError do
|
||||
_error = Whisper::Error.new("non int")
|
||||
error = Whisper::Error.new("non int")
|
||||
end
|
||||
end
|
||||
end
|
@ -106,13 +106,4 @@ class TestModel < TestBase
|
||||
assert_equal 1, model.ftype
|
||||
assert_equal "base", model.type
|
||||
end
|
||||
|
||||
def test_coreml_model_auto_download
|
||||
uri = Whisper::Model.coreml_compiled_models[Whisper::Model.pre_converted_models["tiny"]]
|
||||
model_path = Pathname(uri.to_path).sub_ext("")
|
||||
model_path.rmtree if model_path.exist?
|
||||
|
||||
uri.cache
|
||||
assert_path_exist model_path
|
||||
end
|
||||
end
|
31
bindings/ruby/tests/test_package.rb
Normal file
31
bindings/ruby/tests/test_package.rb
Normal file
@ -0,0 +1,31 @@
|
||||
require_relative "helper"
|
||||
require 'tempfile'
|
||||
require 'tmpdir'
|
||||
require 'shellwords'
|
||||
|
||||
class TestPackage < TestBase
|
||||
def test_build
|
||||
Tempfile.create do |file|
|
||||
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
|
||||
assert file.size > 0
|
||||
assert_path_exist file.to_path
|
||||
end
|
||||
end
|
||||
|
||||
sub_test_case "Building binary on installation" do
|
||||
def setup
|
||||
system "rake", "build", exception: true
|
||||
end
|
||||
|
||||
def test_install
|
||||
match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
|
||||
filename = match_data[1]
|
||||
version = match_data[2]
|
||||
basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
|
||||
Dir.mktmpdir do |dir|
|
||||
system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{filename.shellescape}", exception: true
|
||||
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
@ -32,9 +32,6 @@ class TestParams < TestBase
|
||||
:progress_callback_user_data,
|
||||
:abort_callback,
|
||||
:abort_callback_user_data,
|
||||
:vad,
|
||||
:vad_model_path,
|
||||
:vad_params,
|
||||
]
|
||||
|
||||
def setup
|
||||
@ -165,7 +162,7 @@ class TestParams < TestBase
|
||||
end
|
||||
|
||||
def test_length_penalty
|
||||
assert_equal(-1.0, @params.length_penalty)
|
||||
assert_equal -1.0, @params.length_penalty
|
||||
@params.length_penalty = 0.5
|
||||
assert_equal 0.5, @params.length_penalty
|
||||
end
|
||||
@ -183,9 +180,9 @@ class TestParams < TestBase
|
||||
end
|
||||
|
||||
def test_logprob_thold
|
||||
assert_in_delta(-1.0, @params.logprob_thold)
|
||||
assert_in_delta -1.0, @params.logprob_thold
|
||||
@params.logprob_thold = -0.5
|
||||
assert_in_delta(-0.5, @params.logprob_thold)
|
||||
assert_in_delta -0.5, @params.logprob_thold
|
||||
end
|
||||
|
||||
def test_no_speech_thold
|
||||
@ -194,50 +191,6 @@ class TestParams < TestBase
|
||||
assert_in_delta 0.2, @params.no_speech_thold
|
||||
end
|
||||
|
||||
def test_vad
|
||||
assert_false @params.vad
|
||||
@params.vad = true
|
||||
assert_true @params.vad
|
||||
end
|
||||
|
||||
def test_vad_model_path
|
||||
assert_nil @params.vad_model_path
|
||||
@params.vad_model_path = "silero-v5.1.2"
|
||||
assert_equal Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path, @params.vad_model_path
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_nil
|
||||
@params.vad_model_path = "silero-v5.1.2"
|
||||
@params.vad_model_path = nil
|
||||
assert_nil @params.vad_model_path
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_invalid
|
||||
assert_raise TypeError do
|
||||
@params.vad_model_path = Object.new
|
||||
end
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_URI_string
|
||||
@params.vad_model_path = "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin"
|
||||
assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_URI
|
||||
@params.vad_model_path = URI("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin")
|
||||
assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
|
||||
end
|
||||
|
||||
def test_vad_params
|
||||
assert_kind_of Whisper::VAD::Params, @params.vad_params
|
||||
default_params = @params.vad_params
|
||||
assert_same default_params, @params.vad_params
|
||||
assert_equal 0.5, default_params.threshold
|
||||
new_params = Whisper::VAD::Params.new
|
||||
@params.vad_params = new_params
|
||||
assert_same new_params, @params.vad_params
|
||||
end
|
||||
|
||||
def test_new_with_kw_args
|
||||
params = Whisper::Params.new(language: "es")
|
||||
assert_equal "es", params.language
|
||||
@ -272,10 +225,6 @@ class TestParams < TestBase
|
||||
proc {}
|
||||
in [/_user_data\Z/, *]
|
||||
Object.new
|
||||
in [:vad_model_path, *]
|
||||
Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
|
||||
in [:vad_params, *]
|
||||
Whisper::VAD::Params.new
|
||||
end
|
||||
params = Whisper::Params.new(param => value)
|
||||
if Float === value
|
74
bindings/ruby/tests/test_segment.rb
Normal file
74
bindings/ruby/tests/test_segment.rb
Normal file
@ -0,0 +1,74 @@
|
||||
require_relative "helper"
|
||||
|
||||
class TestSegment < TestBase
|
||||
def test_iteration
|
||||
whisper.each_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
end
|
||||
end
|
||||
|
||||
def test_enumerator
|
||||
enum = whisper.each_segment
|
||||
assert_instance_of Enumerator, enum
|
||||
enum.to_a.each_with_index do |segment, index|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
assert_kind_of Integer, index
|
||||
end
|
||||
end
|
||||
|
||||
def test_start_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal 0, segment.start_time if i == 0
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_end_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_no_speech_prob
|
||||
no_speech_prob = nil
|
||||
whisper.each_segment do |segment|
|
||||
no_speech_prob = segment.no_speech_prob
|
||||
end
|
||||
assert no_speech_prob > 0.0
|
||||
end
|
||||
|
||||
def test_on_new_segment
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
index = 0
|
||||
params.on_new_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
if index == 0
|
||||
seg = segment
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
|
||||
end
|
||||
index += 1
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
assert_equal 0, seg.start_time
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
|
||||
end
|
||||
|
||||
def test_on_new_segment_twice
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
params.on_new_segment do |segment|
|
||||
seg = segment
|
||||
return
|
||||
end
|
||||
params.on_new_segment do |segment|
|
||||
assert_same seg, segment
|
||||
return
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
end
|
||||
end
|
@ -16,25 +16,7 @@ class TestWhisper < TestBase
|
||||
params.print_timestamps = false
|
||||
|
||||
@whisper.transcribe(AUDIO, params) {|text|
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
|
||||
}
|
||||
end
|
||||
|
||||
def test_transcribe_non_parallel
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
|
||||
@whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
|
||||
}
|
||||
end
|
||||
|
||||
def test_transcribe_n_processors
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
|
||||
@whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
|
||||
assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, text
|
||||
}
|
||||
end
|
||||
|
||||
@ -50,7 +32,7 @@ class TestWhisper < TestBase
|
||||
def test_full_get_segment
|
||||
segment = whisper.full_get_segment(0)
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
|
||||
end
|
||||
|
||||
def test_full_get_segment_t0
|
||||
@ -77,7 +59,7 @@ class TestWhisper < TestBase
|
||||
end
|
||||
|
||||
def test_full_get_segment_text
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0))
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
|
||||
end
|
||||
|
||||
def test_full_get_segment_no_speech_prob
|
||||
@ -112,10 +94,6 @@ class TestWhisper < TestBase
|
||||
end
|
||||
end
|
||||
|
||||
def test_system_info_str
|
||||
assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
|
||||
end
|
||||
|
||||
def test_log_set
|
||||
user_data = Object.new
|
||||
logs = []
|
||||
@ -156,14 +134,14 @@ class TestWhisper < TestBase
|
||||
@whisper.full(@params, @samples, @samples.length)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_without_length
|
||||
@whisper.full(@params, @samples)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_enumerator
|
||||
@ -171,7 +149,7 @@ class TestWhisper < TestBase
|
||||
@whisper.full(@params, samples, @samples.length)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_enumerator_without_length
|
||||
@ -193,28 +171,26 @@ class TestWhisper < TestBase
|
||||
@whisper.full(@params, samples)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_parallel
|
||||
nprocessors = 2
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, Etc.nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
assert_equal Etc.nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_with_memory_view
|
||||
nprocessors = 2
|
||||
samples = JFKReader.new(AUDIO)
|
||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||
@whisper.full_parallel(@params, samples, nil, Etc.nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
assert_equal Etc.nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_without_length_and_n_processors
|
||||
@ -222,18 +198,17 @@ class TestWhisper < TestBase
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_without_length
|
||||
nprocessors = 2
|
||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||
@whisper.full_parallel(@params, @samples, nil, Etc.nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
assert_equal Etc.nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_without_n_processors
|
||||
@ -241,52 +216,8 @@ class TestWhisper < TestBase
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
end
|
||||
end
|
||||
|
||||
def test_to_srt
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
whisper.transcribe AUDIO, @params
|
||||
|
||||
lines = whisper.to_srt.lines
|
||||
assert_match(/\A\d+\n/, lines[0])
|
||||
assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
|
||||
end
|
||||
|
||||
def test_to_webvtt
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
whisper.transcribe AUDIO, @params
|
||||
|
||||
lines = whisper.to_webvtt.lines
|
||||
assert_equal "WEBVTT\n", lines[0]
|
||||
assert_equal "\n", lines[1]
|
||||
assert_match(/\A\d+\n/, lines[2])
|
||||
assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
|
||||
end
|
||||
|
||||
sub_test_case "Format needs escape" do
|
||||
def setup
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
@whisper.transcribe AUDIO, Whisper::Params.new
|
||||
segment = @whisper.each_segment.first
|
||||
segment.define_singleton_method :text do
|
||||
"& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
|
||||
end
|
||||
@whisper.define_singleton_method :each_segment do
|
||||
Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
|
||||
end
|
||||
end
|
||||
|
||||
def test_to_srt_escape
|
||||
assert_equal "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
|
||||
end
|
||||
|
||||
def test_to_webvtt_escape
|
||||
assert_equal "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
end
|
||||
end
|
@ -3,7 +3,8 @@ require_relative "extsources"
|
||||
Gem::Specification.new do |s|
|
||||
s.name = "whispercpp"
|
||||
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
||||
s.version = '1.3.3'
|
||||
s.version = '1.3.1'
|
||||
s.date = '2024-12-19'
|
||||
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
||||
s.email = 'todd.fisher@gmail.com'
|
||||
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
||||
@ -14,19 +15,18 @@ Gem::Specification.new do |s|
|
||||
if s.extra_rdoc_files.include?(basename)
|
||||
basename
|
||||
else
|
||||
file.sub("../..", "ext/sources")
|
||||
.sub("../javascript", "ext/sources/bindings/javascript")
|
||||
file.sub("../..", "ext")
|
||||
end
|
||||
}
|
||||
|
||||
s.summary = %q{Ruby whisper.cpp bindings}
|
||||
s.test_files = s.files.select {|file| file.start_with? "test/"}
|
||||
s.test_files = s.files.select {|file| file.start_with? "tests/"}
|
||||
|
||||
s.extensions << 'ext/extconf.rb'
|
||||
s.required_ruby_version = '>= 3.1.0'
|
||||
|
||||
#### Documentation and testing.
|
||||
s.homepage = 'https://github.com/ggml-org/whisper.cpp'
|
||||
s.homepage = 'https://github.com/ggerganov/whisper.cpp'
|
||||
s.rdoc_options = ['--main', 'README.md']
|
||||
|
||||
|
||||
|
@ -1,547 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Options
|
||||
IOS_MIN_OS_VERSION=16.4
|
||||
MACOS_MIN_OS_VERSION=13.3
|
||||
VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
WHISPER_BUILD_EXAMPLES=OFF
|
||||
WHISPER_BUILD_TESTS=OFF
|
||||
WHISPER_BUILD_SERVER=OFF
|
||||
GGML_METAL=ON
|
||||
GGML_METAL_EMBED_LIBRARY=ON
|
||||
GGML_BLAS_DEFAULT=ON
|
||||
GGML_METAL_USE_BF16=ON
|
||||
GGML_OPENMP=OFF
|
||||
|
||||
COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
|
||||
# Common options for all builds
|
||||
COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
|
||||
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
|
||||
-DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
|
||||
-DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DWHISPER_BUILD_EXAMPLES=${WHISPER_BUILD_EXAMPLES}
|
||||
-DWHISPER_BUILD_TESTS=${WHISPER_BUILD_TESTS}
|
||||
-DWHISPER_BUILD_SERVER=${WHISPER_BUILD_SERVER}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
|
||||
-DGGML_METAL=${GGML_METAL}
|
||||
-DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
|
||||
-DGGML_NATIVE=OFF
|
||||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
|
||||
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
|
||||
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
|
||||
echo "Detected Xcode version: $XCODE_VERSION"
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
|
||||
if ! command -v $tool &> /dev/null; then
|
||||
echo "Error: $tool is required but not found."
|
||||
echo "$install_message"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "Checking for required tools..."
|
||||
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
|
||||
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
|
||||
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
|
||||
set -e
|
||||
|
||||
## Clean up previous builds
|
||||
rm -rf build-apple
|
||||
rm -rf build-ios-sim
|
||||
rm -rf build-ios-device
|
||||
rm -rf build-macos
|
||||
rm -rf build-visionos
|
||||
rm -rf build-visionos-sim
|
||||
rm -rf build-tvos-sim
|
||||
rm -rf build-tvos-device
|
||||
|
||||
# Setup the xcframework build directory structure
|
||||
setup_framework_structure() {
|
||||
local build_dir=$1
|
||||
local min_os_version=$2
|
||||
local platform=$3 # "ios", "macos", "visionos", or "tvos"
|
||||
local framework_name="whisper"
|
||||
|
||||
echo "Creating ${platform}-style framework structure for ${build_dir}"
|
||||
|
||||
if [[ "$platform" == "macos" ]]; then
|
||||
# macOS versioned structure uses versioned directories
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
|
||||
|
||||
# Create symbolic links
|
||||
ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
|
||||
ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
|
||||
ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
|
||||
ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
|
||||
ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
|
||||
|
||||
# Set header and module paths
|
||||
local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
|
||||
local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
|
||||
else
|
||||
# iOS/VisionOS/tvOS use a flat structure
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
|
||||
mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
|
||||
|
||||
# Remove any existing structure to ensure clean build
|
||||
rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
|
||||
|
||||
# Set header and module paths
|
||||
local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
|
||||
local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
|
||||
fi
|
||||
|
||||
# Copy all required headers (common for all platforms)
|
||||
cp include/whisper.h ${header_path}
|
||||
cp ggml/include/ggml.h ${header_path}
|
||||
cp ggml/include/ggml-alloc.h ${header_path}
|
||||
cp ggml/include/ggml-backend.h ${header_path}
|
||||
cp ggml/include/ggml-metal.h ${header_path}
|
||||
cp ggml/include/ggml-cpu.h ${header_path}
|
||||
cp ggml/include/ggml-blas.h ${header_path}
|
||||
cp ggml/include/gguf.h ${header_path}
|
||||
|
||||
# Create module map (common for all platforms)
|
||||
cat > ${module_path}module.modulemap << EOF
|
||||
framework module whisper {
|
||||
header "whisper.h"
|
||||
header "ggml.h"
|
||||
header "ggml-alloc.h"
|
||||
header "ggml-backend.h"
|
||||
header "ggml-metal.h"
|
||||
header "ggml-cpu.h"
|
||||
header "ggml-blas.h"
|
||||
header "gguf.h"
|
||||
|
||||
link "c++"
|
||||
link framework "Accelerate"
|
||||
link framework "Metal"
|
||||
link framework "Foundation"
|
||||
|
||||
export *
|
||||
}
|
||||
EOF
|
||||
|
||||
# Platform-specific settings for Info.plist
|
||||
local platform_name=""
|
||||
local sdk_name=""
|
||||
local supported_platform=""
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
platform_name="iphoneos"
|
||||
sdk_name="iphoneos${min_os_version}"
|
||||
supported_platform="iPhoneOS"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=' <key>UIDeviceFamily</key>
|
||||
<array>
|
||||
<integer>1</integer>
|
||||
<integer>2</integer>
|
||||
</array>'
|
||||
;;
|
||||
"macos")
|
||||
platform_name="macosx"
|
||||
sdk_name="macosx${min_os_version}"
|
||||
supported_platform="MacOSX"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
|
||||
local device_family=""
|
||||
;;
|
||||
"visionos")
|
||||
platform_name="xros"
|
||||
sdk_name="xros${min_os_version}"
|
||||
supported_platform="XRPlatform"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=""
|
||||
;;
|
||||
"tvos")
|
||||
platform_name="appletvos"
|
||||
sdk_name="appletvos${min_os_version}"
|
||||
supported_platform="AppleTVOS"
|
||||
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
|
||||
local device_family=' <key>UIDeviceFamily</key>
|
||||
<array>
|
||||
<integer>3</integer>
|
||||
</array>'
|
||||
;;
|
||||
esac
|
||||
|
||||
# Create Info.plist
|
||||
cat > ${plist_path} << EOF
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleDevelopmentRegion</key>
|
||||
<string>en</string>
|
||||
<key>CFBundleExecutable</key>
|
||||
<string>whisper</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>org.ggml.whisper</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>whisper</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>FMWK</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1</string>
|
||||
<key>MinimumOSVersion</key>
|
||||
<string>${min_os_version}</string>
|
||||
<key>CFBundleSupportedPlatforms</key>
|
||||
<array>
|
||||
<string>${supported_platform}</string>
|
||||
</array>${device_family}
|
||||
<key>DTPlatformName</key>
|
||||
<string>${platform_name}</string>
|
||||
<key>DTSDKName</key>
|
||||
<string>${sdk_name}</string>
|
||||
</dict>
|
||||
</plist>
|
||||
EOF
|
||||
}
|
||||
|
||||
# Create dynamic libraries from static libraries.
|
||||
combine_static_libraries() {
|
||||
local build_dir="$1"
|
||||
local release_dir="$2"
|
||||
local platform="$3" # "ios", "macos", "visionos", or "tvos"
|
||||
local is_simulator="$4"
|
||||
local base_dir="$(pwd)"
|
||||
local framework_name="whisper"
|
||||
|
||||
# Determine output path based on platform
|
||||
local output_lib=""
|
||||
if [[ "$platform" == "macos" ]]; then
|
||||
# macOS uses versioned structure
|
||||
output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
|
||||
else
|
||||
# iOS, visionOS, and tvOS use a directory flat structure
|
||||
output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
|
||||
fi
|
||||
|
||||
local libs=(
|
||||
"${base_dir}/${build_dir}/src/${release_dir}/libwhisper.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
)
|
||||
if [[ "$platform" == "macos" || "$platform" == "ios" ]]; then
|
||||
echo "Adding libwhisper.coreml library to the build."
|
||||
libs+=(
|
||||
"${base_dir}/${build_dir}/src/${release_dir}/libwhisper.coreml.a"
|
||||
)
|
||||
fi
|
||||
|
||||
# Create temporary directory for processing
|
||||
local temp_dir="${base_dir}/${build_dir}/temp"
|
||||
echo "Creating temporary directory: ${temp_dir}"
|
||||
mkdir -p "${temp_dir}"
|
||||
|
||||
# Since we have multiple architectures libtool will find object files that do not
|
||||
# match the target architecture. We suppress these warnings.
|
||||
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
|
||||
|
||||
# Determine SDK, architectures, and install_name based on platform and simulator flag.
|
||||
local sdk=""
|
||||
local archs=""
|
||||
local min_version_flag=""
|
||||
local install_name=""
|
||||
local frameworks="-framework Foundation -framework Metal -framework Accelerate"
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="iphonesimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
|
||||
else
|
||||
sdk="iphoneos"
|
||||
archs="arm64"
|
||||
min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/whisper.framework/whisper"
|
||||
frameworks+=" -framework CoreML"
|
||||
;;
|
||||
"macos")
|
||||
sdk="macosx"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
|
||||
install_name="@rpath/whisper.framework/Versions/Current/whisper"
|
||||
frameworks+=" -framework CoreML"
|
||||
;;
|
||||
"visionos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="xrsimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
|
||||
else
|
||||
sdk="xros"
|
||||
archs="arm64"
|
||||
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
# Use flat structure for visionOS, same as iOS
|
||||
install_name="@rpath/whisper.framework/whisper"
|
||||
;;
|
||||
"tvos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
sdk="appletvsimulator"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
|
||||
else
|
||||
sdk="appletvos"
|
||||
archs="arm64"
|
||||
min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/whisper.framework/whisper"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Build architecture flags
|
||||
local arch_flags=""
|
||||
for arch in $archs; do
|
||||
arch_flags+=" -arch $arch"
|
||||
done
|
||||
|
||||
# Create dynamic library
|
||||
echo "Creating dynamic library for ${platform}."
|
||||
xcrun -sdk $sdk clang++ -dynamiclib \
|
||||
-isysroot $(xcrun --sdk $sdk --show-sdk-path) \
|
||||
$arch_flags \
|
||||
$min_version_flag \
|
||||
-Wl,-force_load,"${temp_dir}/combined.a" \
|
||||
$frameworks \
|
||||
-install_name "$install_name" \
|
||||
-o "${base_dir}/${output_lib}"
|
||||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if command -v xcrun vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"visionos")
|
||||
echo "Marking binary as a framework binary for visionOS..."
|
||||
if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
|
||||
echo "Xcode version greater than 16.2, using visionOS."
|
||||
VISION_OS_BUILD_VERSION="visionos"
|
||||
else
|
||||
echo "Xcode version less than or equal to 16.2, using xros."
|
||||
VISION_OS_BUILD_VERSION="xros"
|
||||
fi
|
||||
xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"tvos")
|
||||
echo "Marking binary as a framework binary for tvOS..."
|
||||
xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
echo "Warning: vtool not found. Binary may not pass App Store validation."
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Creating properly formatted dSYM..."
|
||||
# Create a separate directory for dSYMs for all platforms
|
||||
mkdir -p "${base_dir}/${build_dir}/dSYMs"
|
||||
|
||||
# iOS and visionOS style dSYM (flat structure)
|
||||
if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
|
||||
# Generate dSYM in the dSYMs directory
|
||||
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/whisper.dSYM"
|
||||
|
||||
# Create a copy of the binary that will be stripped
|
||||
cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
|
||||
|
||||
# Strip debug symbols from the copy
|
||||
xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
|
||||
|
||||
# Replace the original with the stripped version
|
||||
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
|
||||
else
|
||||
# macOS style dSYM
|
||||
# First strip debug info to a separate file
|
||||
xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
|
||||
|
||||
# Generate dSYM in the dSYMs directory
|
||||
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/whisper.dSYM"
|
||||
|
||||
# Replace original binary with stripped version
|
||||
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
|
||||
fi
|
||||
|
||||
# Remove any automatically generated dSYM files in the framework structure as they will
|
||||
# otherwise case Invalid Bundle Structure validation errors.
|
||||
if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
|
||||
echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
|
||||
rm -rf "${base_dir}/${output_lib}.dSYM"
|
||||
fi
|
||||
|
||||
# Clean up
|
||||
rm -rf "${temp_dir}"
|
||||
}
|
||||
|
||||
echo "Building for iOS simulator..."
|
||||
cmake -B build-ios-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DIOS=ON \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_SYSROOT=iphonesimulator \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DWHISPER_COREML="ON" \
|
||||
-DWHISPER_COREML_ALLOW_FALLBACK="ON" \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
|
||||
echo "Building for iOS devices..."
|
||||
cmake -B build-ios-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_SYSROOT=iphoneos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DWHISPER_COREML="ON" \
|
||||
-DWHISPER_COREML_ALLOW_FALLBACK="ON" \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
|
||||
echo "Building for macOS..."
|
||||
cmake -B build-macos -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DWHISPER_COREML="ON" \
|
||||
-DWHISPER_COREML_ALLOW_FALLBACK="ON" \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
|
||||
echo "Building for visionOS..."
|
||||
cmake -B build-visionos -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xros \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -- -quiet
|
||||
|
||||
echo "Building for visionOS simulator..."
|
||||
cmake -B build-visionos-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xrsimulator \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -- -quiet
|
||||
|
||||
# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
|
||||
echo "Building for tvOS simulator..."
|
||||
cmake -B build-tvos-sim -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_SYSROOT=appletvsimulator \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DGGML_METAL=ON \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -- -quiet
|
||||
|
||||
echo "Building for tvOS devices..."
|
||||
cmake -B build-tvos-device -G Xcode \
|
||||
"${COMMON_CMAKE_ARGS[@]}" \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_SYSROOT=appletvos \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64" \
|
||||
-DGGML_METAL=ON \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -- -quiet
|
||||
|
||||
# Setup frameworks and copy binaries and headers
|
||||
echo "Setting up framework structures..."
|
||||
setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
|
||||
setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
|
||||
setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
|
||||
setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
|
||||
setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
|
||||
setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
|
||||
setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
|
||||
|
||||
# Create dynamic libraries from static libraries
|
||||
echo "Creating dynamic libraries from static libraries..."
|
||||
combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
|
||||
combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
|
||||
combine_static_libraries "build-macos" "Release" "macos" "false"
|
||||
combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
|
||||
combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
|
||||
combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
|
||||
combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
|
||||
|
||||
# Create XCFramework with correct debug symbols paths
|
||||
echo "Creating XCFramework..."
|
||||
xcodebuild -create-xcframework \
|
||||
-framework $(pwd)/build-ios-sim/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-ios-sim/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-ios-device/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-ios-device/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-macos/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMS/whisper.dSYM \
|
||||
-framework $(pwd)/build-visionos/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-visionos/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-visionos-sim/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-visionos-sim/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-tvos-device/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-tvos-device/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-tvos-sim/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-tvos-sim/dSYMs/whisper.dSYM \
|
||||
-output $(pwd)/build-apple/whisper.xcframework
|
41
ci/README.md
41
ci/README.md
@ -1,41 +0,0 @@
|
||||
# CI
|
||||
|
||||
In addition to [Github Actions](https://github.com/ggerganov/whisper.cpp/actions) `whisper.cpp` uses a custom CI framework:
|
||||
|
||||
https://github.com/ggml-org/ci
|
||||
|
||||
It monitors the `master` branch for new commits and runs the
|
||||
[ci/run.sh](https://github.com/ggerganov/whisper.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
|
||||
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
|
||||
to cover various hardware architectures, including GPU and Apple Silicon instances.
|
||||
|
||||
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
|
||||
Only the branches of this repo are monitored for this keyword.
|
||||
|
||||
It is a good practice, before publishing changes to execute the full CI locally on your machine:
|
||||
|
||||
```bash
|
||||
mkdir tmp
|
||||
|
||||
# CPU-only build
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with CUDA support
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The CI script supports several environment variables to control the build:
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `GG_BUILD_CUDA` | Enable NVIDIA CUDA GPU acceleration |
|
||||
| `GG_BUILD_SYCL` | Enable Intel SYCL acceleration |
|
||||
| `GG_BUILD_VULKAN` | Enable Vulkan GPU acceleration |
|
||||
| `GG_BUILD_METAL` | Enable Metal acceleration on Apple Silicon |
|
||||
| `GG_BUILD_BLAS` | Enable BLAS CPU acceleration |
|
||||
| `GG_BUILD_OPENVINO` | Enable OpenVINO support |
|
||||
| `GG_BUILD_COREML` | Enable Core ML support for Apple Neural Engine |
|
||||
| `GG_BUILD_LOW_PERF` | Limit tests for low-performance hardware |
|
||||
| `GG_BUILD_TEST_MODELS` | Comma-separated list of models to test (e.g. "tiny.en,tiny,base,medium", defaults to all models unless `GG_BUILD_LOW_PERF` is set) |
|
336
ci/run.sh
336
ci/run.sh
@ -1,336 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# sample usage:
|
||||
#
|
||||
# mkdir tmp
|
||||
#
|
||||
# # CPU-only build
|
||||
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with CUDA support
|
||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "$1"
|
||||
mkdir -p "$2"
|
||||
|
||||
OUT=$(realpath "$1")
|
||||
MNT=$(realpath "$2")
|
||||
|
||||
rm -f "$OUT/*.log"
|
||||
rm -f "$OUT/*.exit"
|
||||
rm -f "$OUT/*.md"
|
||||
|
||||
sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
ALL_MODELS=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" )
|
||||
BENCH_N_THREADS=4
|
||||
BENCH_ENCODER_ONLY=0
|
||||
BENCH_FLASH_ATTN=0
|
||||
|
||||
# check for user-specified models first. if not specified, use fast models
|
||||
if [ ! -z ${GG_BUILD_TEST_MODELS} ]; then
|
||||
IFS=',' read -r -a MODELS <<< "${GG_BUILD_TEST_MODELS}"
|
||||
else
|
||||
if [ ! -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
MODELS=( "tiny" "base" "small" )
|
||||
else
|
||||
MODELS=("${ALL_MODELS[@]}")
|
||||
fi
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="-DWHISPER_FATAL_WARNINGS=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
if [ -z ${ONEAPI_ROOT} ]; then
|
||||
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
|
||||
echo "source /opt/intel/oneapi/setvars.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DWHISPER_OPENVINO=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_COREML} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DWHISPER_COREML=ON"
|
||||
fi
|
||||
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
function gg_wget {
|
||||
local out=$1
|
||||
local url=$2
|
||||
|
||||
local cwd=`pwd`
|
||||
|
||||
mkdir -p $out
|
||||
cd $out
|
||||
|
||||
# should not re-download if file is the same
|
||||
wget -nv -N $url
|
||||
|
||||
cd $cwd
|
||||
}
|
||||
|
||||
function gg_download_model {
|
||||
local model_name=$1
|
||||
local model_file="$MNT/models/ggml-${model_name}.bin"
|
||||
|
||||
if [ ! -f ${model_file} ]; then
|
||||
local cwd=`pwd`
|
||||
mkdir -p "$MNT/models"
|
||||
cd "$MNT/models"
|
||||
bash "$cwd/models/download-ggml-model.sh" ${model_name} .
|
||||
cd "$cwd"
|
||||
fi
|
||||
}
|
||||
|
||||
function gg_printf {
|
||||
printf -- "$@" >> $OUT/README.md
|
||||
}
|
||||
|
||||
# Helper function to check command exit status
|
||||
function gg_check_last_command_status {
|
||||
local exit_file=$1
|
||||
local command_name=$2
|
||||
|
||||
local exit_status=$?
|
||||
echo "$exit_status" > "$exit_file"
|
||||
|
||||
if [ $exit_status -ne 0 ]; then
|
||||
echo "Error: Command $command_name failed with exit status $exit_status"
|
||||
return 1
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# Usage: gg_run <test_name> [additional_args...]
|
||||
#
|
||||
# Parameters:
|
||||
# test_name - Name of the test to run (calls gg_run_<test_name>)
|
||||
# additional_args - Any additional arguments to pass to the test function (first argument is appended to the log filename)
|
||||
function gg_run {
|
||||
ci=$1
|
||||
|
||||
if [ $# -gt 1 ]; then
|
||||
ci="${ci}_${2}"
|
||||
fi
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
gg_run_$1 "$@" | tee $OUT/$ci.log
|
||||
cur=$?
|
||||
echo "$cur" > $OUT/$ci.exit
|
||||
|
||||
set +x
|
||||
set +o pipefail
|
||||
|
||||
gg_sum_$1 "$@"
|
||||
|
||||
ret=$((ret | cur))
|
||||
}
|
||||
|
||||
function gg_check_build_requirements {
|
||||
if ! command -v cmake &> /dev/null; then
|
||||
gg_printf 'cmake not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v make &> /dev/null; then
|
||||
gg_printf 'make not found, please install'
|
||||
fi
|
||||
}
|
||||
|
||||
## ci
|
||||
|
||||
function gg_run_ctest {
|
||||
mode=$2
|
||||
|
||||
cd ${SRC}
|
||||
|
||||
rm -rf build-ci-${mode} && mkdir build-ci-${mode} && cd build-ci-${mode}
|
||||
|
||||
set -e
|
||||
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=${mode} ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_ctest {
|
||||
mode=$2
|
||||
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest in '${mode}' mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
function gg_run_bench {
|
||||
cd ${SRC}
|
||||
|
||||
# set flash attention flag if enabled
|
||||
fattn=""
|
||||
if [ "$BENCH_FLASH_ATTN" -eq 1 ]; then
|
||||
fattn="-fa"
|
||||
fi
|
||||
|
||||
# run memcpy benchmark if not encoder-only mode
|
||||
if [ "$BENCH_ENCODER_ONLY" -eq 0 ]; then
|
||||
echo "Running memcpy benchmark"
|
||||
(time ./build-ci-release/bin/whisper-bench -w 1 -t $BENCH_N_THREADS 2>&1) | tee -a $OUT/${ci}-memcpy.log
|
||||
gg_check_last_command_status "$OUT/${ci}-memcpy.exit" "memcpy benchmark"
|
||||
|
||||
echo "Running ggml_mul_mat benchmark with $BENCH_N_THREADS threads"
|
||||
(time ./build-ci-release/bin/whisper-bench -w 2 -t $BENCH_N_THREADS 2>&1) | tee -a $OUT/${ci}-mul_mat.log
|
||||
gg_check_last_command_status "$OUT/${ci}-mul_mat.exit" "ggml_mul_mat benchmark"
|
||||
fi
|
||||
|
||||
echo "Running benchmark for all models"
|
||||
|
||||
# generate header for the benchmark table
|
||||
{
|
||||
printf "| %16s | %13s | %3s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "Config" "Model" "Th" "FA" "Enc." "Dec." "Bch5" "PP" "Commit"
|
||||
printf "| %16s | %13s | %3s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---"
|
||||
} | tee -a $OUT/${ci}-models-table.log
|
||||
|
||||
# run benchmark for each model
|
||||
for model in "${MODELS[@]}"; do
|
||||
echo "Benchmarking model: $model"
|
||||
|
||||
# run the benchmark and capture output
|
||||
output=$(./build-ci-release/bin/whisper-bench -m $MNT/models/ggml-$model.bin -t $BENCH_N_THREADS $fattn 2>&1)
|
||||
ret=$?
|
||||
|
||||
# save the raw output
|
||||
echo "$output" > $OUT/${ci}-bench-$model.log
|
||||
|
||||
if [ $ret -eq 0 ]; then
|
||||
# parse the benchmark results
|
||||
encode_time=$(echo "$output" | grep "encode time" | awk '{print $11}')
|
||||
decode_time=$(echo "$output" | grep "decode time" | awk '{print $11}')
|
||||
batchd_time=$(echo "$output" | grep "batchd time" | awk '{print $11}')
|
||||
prompt_time=$(echo "$output" | grep "prompt time" | awk '{print $11}')
|
||||
system_info=$(echo "$output" | grep "system_info")
|
||||
actual_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
|
||||
|
||||
# determine configuration
|
||||
config=""
|
||||
if [[ $system_info == *"AVX2 = 1"* ]]; then
|
||||
config="$config AVX2"
|
||||
fi
|
||||
if [[ $system_info == *"NEON = 1"* ]]; then
|
||||
config="$config NEON"
|
||||
fi
|
||||
if [[ $system_info == *"BLAS = 1"* ]]; then
|
||||
config="$config BLAS"
|
||||
fi
|
||||
if [[ $system_info == *"COREML = 1"* ]]; then
|
||||
config="$config COREML"
|
||||
fi
|
||||
if [[ $system_info == *"CUDA = 1"* ]]; then
|
||||
config="$config CUDA"
|
||||
fi
|
||||
if [[ $system_info == *"METAL = 1"* ]]; then
|
||||
config="$config METAL"
|
||||
fi
|
||||
|
||||
# get commit hash
|
||||
commit=$(git rev-parse --short HEAD)
|
||||
|
||||
# add row to benchmark table
|
||||
printf "| %16s | %13s | %3s | %3s | %7s | %7s | %7s | %7s | %7s |\n" \
|
||||
"$config" "$model" "$actual_threads" "$BENCH_FLASH_ATTN" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit" \
|
||||
| tee -a $OUT/${ci}-models-table.log
|
||||
else
|
||||
echo "Benchmark failed for model: $model" | tee -a $OUT/${ci}-bench-errors.log
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
function gg_sum_bench {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Whisper Benchmark Results\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
|
||||
# show memcpy and ggml_mul_mat benchmark results if available
|
||||
if [ "$BENCH_ENCODER_ONLY" -eq 0 ]; then
|
||||
if [ -f "$OUT/${ci}-memcpy.log" ]; then
|
||||
gg_printf '#### memcpy Benchmark\n\n'
|
||||
gg_printf '```\n%s\n```\n\n' "$(cat $OUT/${ci}-memcpy.log)"
|
||||
fi
|
||||
|
||||
if [ -f "$OUT/${ci}-mul_mat.log" ]; then
|
||||
gg_printf '#### ggml_mul_mat Benchmark\n\n'
|
||||
gg_printf '```\n%s\n```\n\n' "$(cat $OUT/${ci}-mul_mat.log)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# show model benchmark results
|
||||
gg_printf '#### Model Benchmarks\n\n'
|
||||
if [ -f "$OUT/${ci}-models-table.log" ]; then
|
||||
gg_printf '%s\n\n' "$(cat $OUT/${ci}-models-table.log)"
|
||||
else
|
||||
gg_printf 'No model benchmark results available.\n\n'
|
||||
fi
|
||||
|
||||
# show any errors that occurred
|
||||
if [ -f "$OUT/${ci}-bench-errors.log" ]; then
|
||||
gg_printf '#### Benchmark Errors\n\n'
|
||||
gg_printf '```\n%s\n```\n\n' "$(cat $OUT/${ci}-bench-errors.log)"
|
||||
fi
|
||||
}
|
||||
|
||||
ret=0
|
||||
|
||||
for model in "${MODELS[@]}"; do
|
||||
test $ret -eq 0 && gg_download_model ${model}
|
||||
done
|
||||
if [ -z ${GG_BUILD_SYCL}]; then
|
||||
test $ret -eq 0 && gg_run ctest debug
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest release
|
||||
|
||||
test $ret -eq 0 && gg_run bench
|
||||
|
||||
exit $ret
|
@ -14,6 +14,10 @@ if (WHISPER_SDL2)
|
||||
message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
|
||||
endif()
|
||||
|
||||
if (WHISPER_CLBLAST)
|
||||
find_package(CLBlast REQUIRED)
|
||||
endif()
|
||||
|
||||
# common
|
||||
|
||||
set(TARGET common)
|
||||
@ -52,8 +56,6 @@ add_library(${TARGET} STATIC
|
||||
common.cpp
|
||||
common-ggml.h
|
||||
common-ggml.cpp
|
||||
common-whisper.h
|
||||
common-whisper.cpp
|
||||
grammar-parser.h
|
||||
grammar-parser.cpp
|
||||
${COMMON_SOURCES_FFMPEG}
|
||||
@ -61,7 +63,7 @@ add_library(${TARGET} STATIC
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE whisper ${COMMON_EXTRA_LIBS} ${CMAKE_DL_LIBS})
|
||||
target_link_libraries(${TARGET} PRIVATE whisper ${COMMON_EXTRA_LIBS})
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
|
||||
@ -105,7 +107,6 @@ else()
|
||||
add_subdirectory(bench)
|
||||
add_subdirectory(server)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(vad-speech-segments)
|
||||
if (WHISPER_SDL2)
|
||||
add_subdirectory(stream)
|
||||
add_subdirectory(command)
|
||||
|
@ -17,23 +17,14 @@ const whisperParamsMock = {
|
||||
comma_in_time: false,
|
||||
translate: true,
|
||||
no_timestamps: false,
|
||||
detect_language: false,
|
||||
audio_ctx: 0,
|
||||
max_len: 0,
|
||||
prompt: "",
|
||||
print_progress: false,
|
||||
progress_callback: (progress) => {
|
||||
console.log(`Progress: ${progress}`);
|
||||
},
|
||||
max_context: -1
|
||||
};
|
||||
|
||||
describe("Run whisper.node", () => {
|
||||
test("it should receive a non-empty value", async () => {
|
||||
let result = await whisperAsync(whisperParamsMock);
|
||||
console.log(result);
|
||||
|
||||
expect(result['transcription'].length).toBeGreaterThan(0);
|
||||
expect(result.length).toBeGreaterThan(0);
|
||||
}, 10000);
|
||||
});
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
#include "napi.h"
|
||||
#include "common.h"
|
||||
#include "common-whisper.h"
|
||||
|
||||
#include "whisper.h"
|
||||
|
||||
@ -38,7 +37,6 @@ struct whisper_params {
|
||||
bool print_progress = false;
|
||||
bool no_timestamps = false;
|
||||
bool no_prints = false;
|
||||
bool detect_language= false;
|
||||
bool use_gpu = true;
|
||||
bool flash_attn = false;
|
||||
bool comma_in_time = true;
|
||||
@ -83,7 +81,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
||||
t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
}
|
||||
|
||||
if (!params.no_timestamps && !params.no_prints) {
|
||||
if (!params.no_timestamps) {
|
||||
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||
}
|
||||
|
||||
@ -114,14 +112,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
||||
|
||||
// colorful print bug
|
||||
//
|
||||
if (!params.no_prints) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
printf("%s%s", speaker.c_str(), text);
|
||||
}
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
printf("%s%s", speaker.c_str(), text);
|
||||
|
||||
|
||||
// with timestamps or speakers: each segment on new line
|
||||
if ((!params.no_timestamps || params.diarize) && !params.no_prints) {
|
||||
if (!params.no_timestamps || params.diarize) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@ -131,249 +127,192 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
||||
|
||||
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
|
||||
|
||||
struct whisper_result {
|
||||
std::vector<std::vector<std::string>> segments;
|
||||
std::string language;
|
||||
};
|
||||
|
||||
class ProgressWorker : public Napi::AsyncWorker {
|
||||
public:
|
||||
ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
|
||||
: Napi::AsyncWorker(callback), params(params), env(env) {
|
||||
// Create thread-safe function
|
||||
if (!progress_callback.IsEmpty()) {
|
||||
tsfn = Napi::ThreadSafeFunction::New(
|
||||
env,
|
||||
progress_callback,
|
||||
"Progress Callback",
|
||||
0,
|
||||
1
|
||||
);
|
||||
}
|
||||
int run(whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
|
||||
if (params.no_prints) {
|
||||
whisper_log_set(cb_log_disable, NULL);
|
||||
}
|
||||
|
||||
~ProgressWorker() {
|
||||
if (tsfn) {
|
||||
// Make sure to release the thread-safe function on destruction
|
||||
tsfn.Release();
|
||||
}
|
||||
if (params.fname_inp.empty() && params.pcmf32.empty()) {
|
||||
fprintf(stderr, "error: no input files or audio buffer specified\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
void Execute() override {
|
||||
// Use custom run function with progress callback support
|
||||
run_with_progress(params, result);
|
||||
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
void OnOK() override {
|
||||
Napi::HandleScope scope(Env());
|
||||
// whisper init
|
||||
|
||||
if (params.detect_language) {
|
||||
Napi::Object resultObj = Napi::Object::New(Env());
|
||||
resultObj.Set("language", Napi::String::New(Env(), result.language));
|
||||
Callback().Call({Env().Null(), resultObj});
|
||||
}
|
||||
struct whisper_context_params cparams = whisper_context_default_params();
|
||||
cparams.use_gpu = params.use_gpu;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
||||
|
||||
Napi::Object returnObj = Napi::Object::New(Env());
|
||||
if (!result.language.empty()) {
|
||||
returnObj.Set("language", Napi::String::New(Env(), result.language));
|
||||
}
|
||||
Napi::Array transcriptionArray = Napi::Array::New(Env(), result.segments.size());
|
||||
for (uint64_t i = 0; i < result.segments.size(); ++i) {
|
||||
Napi::Object tmp = Napi::Array::New(Env(), 3);
|
||||
for (uint64_t j = 0; j < 3; ++j) {
|
||||
tmp[j] = Napi::String::New(Env(), result.segments[i][j]);
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "error: failed to initialize whisper context\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
// if params.pcmf32 is provided, set params.fname_inp to "buffer"
|
||||
// this is simpler than further modifications in the code
|
||||
if (!params.pcmf32.empty()) {
|
||||
fprintf(stderr, "info: using audio buffer as input\n");
|
||||
params.fname_inp.clear();
|
||||
params.fname_inp.emplace_back("buffer");
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
// read the input audio file if params.pcmf32 is not provided
|
||||
if (params.pcmf32.empty()) {
|
||||
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
|
||||
continue;
|
||||
}
|
||||
transcriptionArray[i] = tmp;
|
||||
}
|
||||
returnObj.Set("transcription", transcriptionArray);
|
||||
Callback().Call({Env().Null(), returnObj});
|
||||
}
|
||||
} else {
|
||||
pcmf32 = params.pcmf32;
|
||||
}
|
||||
|
||||
// Progress callback function - using thread-safe function
|
||||
void OnProgress(int progress) {
|
||||
if (tsfn) {
|
||||
// Use thread-safe function to call JavaScript callback
|
||||
auto callback = [progress](Napi::Env env, Napi::Function jsCallback) {
|
||||
jsCallback.Call({Napi::Number::New(env, progress)});
|
||||
};
|
||||
|
||||
tsfn.BlockingCall(callback);
|
||||
// print system information
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||
}
|
||||
|
||||
// print some info about the processing
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
|
||||
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads, params.n_processors,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1,
|
||||
params.audio_ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// run the inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = params.print_progress;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.translate = params.translate;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
wparams.duration_ms = params.duration_ms;
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
|
||||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
|
||||
wparams.initial_prompt = params.prompt.c_str();
|
||||
|
||||
wparams.no_timestamps = params.no_timestamps;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
// example for abort mechanism
|
||||
// in this example, we do not abort the processing, but we could if the flag is set to true
|
||||
// the callback is called before every encoder run - if it returns false, the processing is aborted
|
||||
{
|
||||
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
|
||||
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
result.resize(n_segments);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
|
||||
result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
|
||||
result[i].emplace_back(text);
|
||||
}
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
class Worker : public Napi::AsyncWorker {
|
||||
public:
|
||||
Worker(Napi::Function& callback, whisper_params params)
|
||||
: Napi::AsyncWorker(callback), params(params) {}
|
||||
|
||||
void Execute() override {
|
||||
run(params, result);
|
||||
}
|
||||
|
||||
void OnOK() override {
|
||||
Napi::HandleScope scope(Env());
|
||||
Napi::Object res = Napi::Array::New(Env(), result.size());
|
||||
for (uint64_t i = 0; i < result.size(); ++i) {
|
||||
Napi::Object tmp = Napi::Array::New(Env(), 3);
|
||||
for (uint64_t j = 0; j < 3; ++j) {
|
||||
tmp[j] = Napi::String::New(Env(), result[i][j]);
|
||||
}
|
||||
res[i] = tmp;
|
||||
}
|
||||
Callback().Call({Env().Null(), res});
|
||||
}
|
||||
|
||||
private:
|
||||
whisper_params params;
|
||||
whisper_result result;
|
||||
Napi::Env env;
|
||||
Napi::ThreadSafeFunction tsfn;
|
||||
|
||||
// Custom run function with progress callback support
|
||||
int run_with_progress(whisper_params ¶ms, whisper_result & result) {
|
||||
if (params.no_prints) {
|
||||
whisper_log_set(cb_log_disable, NULL);
|
||||
}
|
||||
|
||||
if (params.fname_inp.empty() && params.pcmf32.empty()) {
|
||||
fprintf(stderr, "error: no input files or audio buffer specified\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
struct whisper_context_params cparams = whisper_context_default_params();
|
||||
cparams.use_gpu = params.use_gpu;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "error: failed to initialize whisper context\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
// If params.pcmf32 provides, set params.fname_inp as "buffer"
|
||||
if (!params.pcmf32.empty()) {
|
||||
fprintf(stderr, "info: using audio buffer as input\n");
|
||||
params.fname_inp.clear();
|
||||
params.fname_inp.emplace_back("buffer");
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
// If params.pcmf32 is empty, read input audio file
|
||||
if (params.pcmf32.empty()) {
|
||||
if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
pcmf32 = params.pcmf32;
|
||||
}
|
||||
|
||||
// Print system info
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||
}
|
||||
|
||||
// Print processing info
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
|
||||
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads, params.n_processors,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1,
|
||||
params.audio_ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Run inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = params.print_progress;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.translate = params.translate;
|
||||
wparams.language = params.detect_language ? "auto" : params.language.c_str();
|
||||
wparams.detect_language = params.detect_language;
|
||||
wparams.n_threads = params.n_threads;
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
wparams.duration_ms = params.duration_ms;
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
|
||||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
|
||||
wparams.initial_prompt = params.prompt.c_str();
|
||||
|
||||
wparams.no_timestamps = params.no_timestamps;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
|
||||
// This callback is called for each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
// Set progress callback
|
||||
wparams.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
|
||||
ProgressWorker* worker = static_cast<ProgressWorker*>(user_data);
|
||||
worker->OnProgress(progress);
|
||||
};
|
||||
wparams.progress_callback_user_data = this;
|
||||
|
||||
// Abort mechanism example
|
||||
{
|
||||
static bool is_aborted = false; // Note: this should be atomic to avoid data races
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
|
||||
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.detect_language || params.language == "auto") {
|
||||
result.language = whisper_lang_str(whisper_full_lang_id(ctx));
|
||||
}
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
result.segments.resize(n_segments);
|
||||
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
result.segments[i].emplace_back(to_timestamp(t0, params.comma_in_time));
|
||||
result.segments[i].emplace_back(to_timestamp(t1, params.comma_in_time));
|
||||
result.segments[i].emplace_back(text);
|
||||
}
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
whisper_params params;
|
||||
std::vector<std::vector<std::string>> result;
|
||||
};
|
||||
|
||||
|
||||
|
||||
Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
Napi::Env env = info.Env();
|
||||
if (info.Length() <= 0 || !info[0].IsObject()) {
|
||||
@ -389,33 +328,9 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
|
||||
bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
|
||||
bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
|
||||
bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
|
||||
int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
|
||||
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
||||
int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
|
||||
|
||||
// Add support for max_context
|
||||
int32_t max_context = -1;
|
||||
if (whisper_params.Has("max_context") && whisper_params.Get("max_context").IsNumber()) {
|
||||
max_context = whisper_params.Get("max_context").As<Napi::Number>();
|
||||
}
|
||||
|
||||
// support prompt
|
||||
std::string prompt = "";
|
||||
if (whisper_params.Has("prompt") && whisper_params.Get("prompt").IsString()) {
|
||||
prompt = whisper_params.Get("prompt").As<Napi::String>();
|
||||
}
|
||||
|
||||
// Add support for print_progress
|
||||
bool print_progress = false;
|
||||
if (whisper_params.Has("print_progress")) {
|
||||
print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
|
||||
}
|
||||
// Add support for progress_callback
|
||||
Napi::Function progress_callback;
|
||||
if (whisper_params.Has("progress_callback") && whisper_params.Get("progress_callback").IsFunction()) {
|
||||
progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
|
||||
}
|
||||
|
||||
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
|
||||
std::vector<float> pcmf32_vec;
|
||||
@ -439,14 +354,9 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
params.pcmf32 = pcmf32_vec;
|
||||
params.comma_in_time = comma_in_time;
|
||||
params.max_len = max_len;
|
||||
params.max_context = max_context;
|
||||
params.print_progress = print_progress;
|
||||
params.prompt = prompt;
|
||||
params.detect_language = detect_language;
|
||||
|
||||
Napi::Function callback = info[1].As<Napi::Function>();
|
||||
// Create a new Worker class with progress callback support
|
||||
ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
|
||||
Worker* worker = new Worker(callback, params);
|
||||
worker->Queue();
|
||||
return env.Undefined();
|
||||
}
|
||||
|
@ -17,12 +17,8 @@ const whisperParams = {
|
||||
comma_in_time: false,
|
||||
translate: true,
|
||||
no_timestamps: false,
|
||||
detect_language: false,
|
||||
audio_ctx: 0,
|
||||
max_len: 0,
|
||||
progress_callback: (progress) => {
|
||||
console.log(`progress: ${progress}%`);
|
||||
}
|
||||
};
|
||||
|
||||
const arguments = process.argv.slice(2);
|
||||
@ -32,8 +28,6 @@ const params = Object.fromEntries(
|
||||
const [key, value] = item.slice(2).split("=");
|
||||
if (key === "audio_ctx") {
|
||||
whisperParams[key] = parseInt(value);
|
||||
} else if (key === "detect_language") {
|
||||
whisperParams[key] = value === "true";
|
||||
} else {
|
||||
whisperParams[key] = value;
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
-s INITIAL_MEMORY=2000MB \
|
||||
-s TOTAL_MEMORY=2000MB \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap', 'HEAPU8']\" \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
")
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
Benchmark the performance of whisper.cpp in the browser using WebAssembly
|
||||
|
||||
Link: https://ggerganov.github.io/whisper.cpp/bench.wasm
|
||||
Link: https://whisper.ggerganov.com/bench/
|
||||
|
||||
Terminal version: [examples/bench](/examples/bench)
|
||||
|
||||
@ -15,23 +15,8 @@ cd whisper.cpp
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake ..
|
||||
make -j
|
||||
```
|
||||
The example can then be started by running a local HTTP server:
|
||||
```console
|
||||
python3 examples/server.py
|
||||
```
|
||||
And then opening a browser to the following URL:
|
||||
http://localhost:8000/bench.wasm
|
||||
|
||||
To run the example in a different server, you need to copy the following files
|
||||
to the server's HTTP path:
|
||||
```
|
||||
# copy the produced page to your HTTP path
|
||||
cp bin/bench.wasm/* /path/to/html/
|
||||
cp bin/libbench.js /path/to/html/
|
||||
cp bin/libbench.worker.js /path/to/html/
|
||||
```
|
||||
|
||||
> 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no
|
||||
> longer generated and the worker is embedded in the main JS file. So the worker
|
||||
> file will not be geneated for versions later than `3.1.58`.
|
||||
|
@ -24,8 +24,6 @@
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
<script src="../coi-serviceworker.js"></script>
|
||||
<link rel="icon" href="data:,">
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
@ -38,10 +36,11 @@
|
||||
<br><br>
|
||||
|
||||
<b>More examples:</b>
|
||||
<a href="../">main</a> |
|
||||
<a href="../bench.wasm/">bench</a> |
|
||||
<a href="../stream.wasm">stream</a> |
|
||||
<a href="../command.wasm/">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||
|
||||
<br><br>
|
||||
|
||||
|
@ -4,7 +4,7 @@ A very basic tool for benchmarking the inference performance on your device. The
|
||||
the transformer on some random audio data and records the execution time. This way we can have an objective comparison
|
||||
of the performance of the model for various setups.
|
||||
|
||||
Benchmark results are tracked in the following Github issue: https://github.com/ggml-org/whisper.cpp/issues/89
|
||||
Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
|
||||
|
||||
```bash
|
||||
# run the bench too on the small.en model using 4 threads
|
||||
@ -40,7 +40,7 @@ system_info: n_threads = 4 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WA
|
||||
|
||||
If you wish, you can submit these results here:
|
||||
|
||||
https://github.com/ggml-org/whisper.cpp/issues/89
|
||||
https://github.com/ggerganov/whisper.cpp/issues/89
|
||||
|
||||
Please include the following information:
|
||||
|
||||
|
@ -50,11 +50,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
|
||||
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
||||
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
|
||||
fprintf(stderr, " %-7s 0 - whisper\n", "");
|
||||
fprintf(stderr, " %-7s 1 - memcpy\n", "");
|
||||
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
|
||||
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
||||
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -156,8 +156,6 @@ static int whisper_bench_full(const whisper_params & params) {
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_backend_load_all();
|
||||
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
|
@ -6,8 +6,7 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
|
||||
```
|
||||
./build/bin/whisper-cli -h
|
||||
|
||||
usage: ./build/bin/whisper-cli [options] file0 file1 ...
|
||||
supported audio formats: flac, mp3, ogg, wav
|
||||
usage: ./build-pkg/bin/whisper-cli [options] file0.wav file1.wav ...
|
||||
|
||||
options:
|
||||
-h, --help [default] show this help message and exit
|
||||
@ -25,7 +24,6 @@ options:
|
||||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
||||
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
|
||||
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
|
||||
-nth N, --no-speech-thold N [0.60 ] no speech threshold
|
||||
-tp, --temperature N [0.00 ] The sampling temperature, between 0 and 1
|
||||
-tpi, --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1
|
||||
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
|
||||
@ -52,13 +50,12 @@ options:
|
||||
-dl, --detect-language [false ] exit after automatically detecting language
|
||||
--prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens)
|
||||
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
|
||||
-f FNAME, --file FNAME [ ] input audio file path
|
||||
-f FNAME, --file FNAME [ ] input WAV file path
|
||||
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
||||
-dtw MODEL --dtw MODEL [ ] compute token-level timestamps
|
||||
-ls, --log-score [false ] log best decoder scores of tokens
|
||||
-ng, --no-gpu [false ] disable GPU
|
||||
-fa, --flash-attn [false ] flash attention
|
||||
-sns, --suppress-nst [false ] suppress non-speech tokens
|
||||
--suppress-regex REGEX [ ] regular expression matching tokens to suppress
|
||||
--grammar GRAMMAR [ ] GBNF grammar to guide decoding
|
||||
--grammar-rule RULE [ ] top-level GBNF grammar rule name
|
||||
|
@ -1,5 +1,4 @@
|
||||
#include "common.h"
|
||||
#include "common-whisper.h"
|
||||
|
||||
#include "whisper.h"
|
||||
#include "grammar-parser.h"
|
||||
@ -7,19 +6,21 @@
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <cstdio>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <cfloat>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
// helper function to replace substrings
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
for (size_t pos = 0; ; pos += replace.length()) {
|
||||
@ -70,7 +71,6 @@ struct whisper_params {
|
||||
bool no_prints = false;
|
||||
bool print_special = false;
|
||||
bool print_colors = false;
|
||||
bool print_confidence= false;
|
||||
bool print_progress = false;
|
||||
bool no_timestamps = false;
|
||||
bool log_score = false;
|
||||
@ -99,16 +99,6 @@ struct whisper_params {
|
||||
std::vector<std::string> fname_out = {};
|
||||
|
||||
grammar_parser::parse_state grammar_parsed;
|
||||
|
||||
// Voice Activity Detection (VAD) parameters
|
||||
bool vad = false;
|
||||
std::string vad_model = "";
|
||||
float vad_threshold = 0.5f;
|
||||
int vad_min_speech_duration_ms = 250;
|
||||
int vad_min_silence_duration_ms = 100;
|
||||
float vad_max_speech_duration_s = FLT_MAX;
|
||||
int vad_speech_pad_ms = 30;
|
||||
float vad_samples_overlap = 0.1f;
|
||||
};
|
||||
|
||||
static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
@ -180,7 +170,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
||||
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
else if ( arg == "--print-confidence"){ params.print_confidence= true; }
|
||||
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
||||
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
|
||||
@ -198,15 +187,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
||||
else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; }
|
||||
else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; }
|
||||
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
|
||||
// Voice Activity Detection (VAD)
|
||||
else if ( arg == "--vad") { params.vad = true; }
|
||||
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
|
||||
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -219,8 +199,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
||||
|
||||
static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options] file0 file1 ...\n", argv[0]);
|
||||
fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
|
||||
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
@ -259,14 +238,13 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
||||
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
fprintf(stderr, " --print-confidence [%-7s] print confidence\n", params.print_confidence ? "true" : "false");
|
||||
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
||||
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
||||
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
|
||||
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", "");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
|
||||
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
|
||||
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
|
||||
@ -277,18 +255,6 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
||||
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
|
||||
fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str());
|
||||
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
|
||||
// Voice Activity Detection (VAD) parameters
|
||||
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
|
||||
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
|
||||
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
|
||||
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
|
||||
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
|
||||
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
|
||||
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
|
||||
std::string("FLT_MAX").c_str() :
|
||||
std::to_string(params.vad_max_speech_duration_s).c_str());
|
||||
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
|
||||
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -389,26 +355,6 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
|
||||
|
||||
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
} else if (params.print_confidence) {
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
int style_idx = 2; // High confidence - dim
|
||||
if (p < 0.33) {
|
||||
style_idx = 0; // Low confidence - inverse (highlighted)
|
||||
} else if (p < 0.66) {
|
||||
style_idx = 1; // Medium confidence - underlined
|
||||
}
|
||||
printf("%s%s%s%s", speaker.c_str(), k_styles[style_idx].c_str(), text, "\033[0m");
|
||||
}
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
@ -430,7 +376,15 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
|
||||
}
|
||||
}
|
||||
|
||||
static void output_txt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
static bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
@ -445,9 +399,19 @@ static void output_txt(struct whisper_context * ctx, std::ofstream & fout, const
|
||||
|
||||
fout << speaker << text << "\n";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void output_vtt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
static bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
fout << "WEBVTT\n\n";
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
@ -467,9 +431,19 @@ static void output_vtt(struct whisper_context * ctx, std::ofstream & fout, const
|
||||
fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
|
||||
fout << speaker << text << "\n\n";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void output_srt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
static bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
@ -486,6 +460,8 @@ static void output_srt(struct whisper_context * ctx, std::ofstream & fout, const
|
||||
fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
|
||||
fout << speaker << text << "\n\n";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static char * escape_double_quotes_and_backslashes(const char * str) {
|
||||
@ -551,7 +527,15 @@ static char * escape_double_quotes_in_csv(const char * str) {
|
||||
return escaped;
|
||||
}
|
||||
|
||||
static void output_csv(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
static bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
fout << "start,end,";
|
||||
if (params.diarize && pcmf32s.size() == 2)
|
||||
@ -574,9 +558,14 @@ static void output_csv(struct whisper_context * ctx, std::ofstream & fout, const
|
||||
}
|
||||
fout << "\"" << text_escaped << "\"\n";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void output_score(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
|
||||
static bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
|
||||
std::ofstream fout(fname);
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
// fprintf(stderr,"segments: %d\n",n_segments);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
@ -589,14 +578,16 @@ static void output_score(struct whisper_context * ctx, std::ofstream & fout, con
|
||||
// fprintf(stderr,"token: %s %f\n",token,probability);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void output_json(
|
||||
static bool output_json(
|
||||
struct whisper_context * ctx,
|
||||
std::ofstream & fout,
|
||||
const char * fname,
|
||||
const whisper_params & params,
|
||||
std::vector<std::vector<float>> pcmf32s) {
|
||||
const bool full = params.output_jsn_full;
|
||||
std::vector<std::vector<float>> pcmf32s,
|
||||
bool full) {
|
||||
std::ofstream fout(fname);
|
||||
int indent = 0;
|
||||
|
||||
auto doindent = [&]() {
|
||||
@ -676,6 +667,12 @@ static void output_json(
|
||||
end_obj(end);
|
||||
};
|
||||
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
start_obj(nullptr);
|
||||
value_s("systeminfo", whisper_print_system_info(), false);
|
||||
start_obj("model");
|
||||
@ -749,12 +746,17 @@ static void output_json(
|
||||
|
||||
end_arr(true);
|
||||
end_obj(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
// karaoke video generation
|
||||
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
||||
// TODO: font parameter adjustments
|
||||
static bool output_wts(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s, const char * fname_inp, float t_sec, const char * fname_out) {
|
||||
static bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
static const char * font = params.font_path.c_str();
|
||||
|
||||
std::ifstream fin(font);
|
||||
@ -870,12 +872,20 @@ static bool output_wts(struct whisper_context * ctx, std::ofstream & fout, const
|
||||
|
||||
fout.close();
|
||||
|
||||
fprintf(stderr, "# %s: run 'source %s' to generate karaoke video\n", __func__, fname_out);
|
||||
fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
static bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
fout << "[by:whisper.cpp]\n";
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
@ -903,14 +913,14 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
|
||||
|
||||
fout << '[' << timestamp_lrc << ']' << speaker << text << "\n";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_backend_load_all();
|
||||
|
||||
#if defined(_WIN32)
|
||||
// Set the console output code page to UTF-8, while command line arguments
|
||||
// are still encoded in the system's code page. In this way, we can print
|
||||
@ -990,6 +1000,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context_params cparams = whisper_context_default_params();
|
||||
|
||||
cparams.use_gpu = params.use_gpu;
|
||||
@ -1052,61 +1063,14 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto & fname_inp = params.fname_inp[f];
|
||||
struct fout_factory {
|
||||
std::string fname_out;
|
||||
const size_t basename_length;
|
||||
const bool is_stdout;
|
||||
bool used_stdout;
|
||||
decltype(whisper_print_segment_callback) * const print_segment_callback;
|
||||
std::ofstream fout;
|
||||
|
||||
fout_factory (const std::string & fname_out_, const std::string & fname_inp, whisper_params & params) :
|
||||
fname_out{!fname_out_.empty() ? fname_out_ : fname_inp},
|
||||
basename_length{fname_out.size()},
|
||||
is_stdout{fname_out == "-"},
|
||||
used_stdout{},
|
||||
print_segment_callback{is_stdout ? nullptr : whisper_print_segment_callback} {
|
||||
if (!print_segment_callback) {
|
||||
params.print_progress = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool open(const char * ext, const char * function) {
|
||||
if (is_stdout) {
|
||||
if (used_stdout) {
|
||||
fprintf(stderr, "warning: Not appending multiple file formats to stdout\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
used_stdout = true;
|
||||
#ifdef _WIN32
|
||||
fout = std::ofstream{"CON"};
|
||||
#else
|
||||
fout = std::ofstream{"/dev/stdout"};
|
||||
#endif
|
||||
// Not using fprintf stderr here because it might equal stdout
|
||||
// Also assuming /dev is mounted
|
||||
return true;
|
||||
}
|
||||
|
||||
fname_out.resize(basename_length);
|
||||
fname_out += ext;
|
||||
fout = std::ofstream{fname_out};
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", function, fname_out.c_str());
|
||||
return true;
|
||||
}
|
||||
} fout_factory{f < (int) params.fname_out.size() ? params.fname_out[f] : "", fname_inp, params};
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
|
||||
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1137,11 +1101,6 @@ int main(int argc, char ** argv) {
|
||||
params.tinydiarize ? "tdrz = 1, " : "",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
if (params.print_colors) {
|
||||
fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
|
||||
} else if (params.print_confidence) {
|
||||
fprintf(stderr, "%s: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)\n", __func__);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -1192,16 +1151,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
wparams.suppress_nst = params.suppress_nst;
|
||||
|
||||
wparams.vad = params.vad;
|
||||
wparams.vad_model_path = params.vad_model.c_str();
|
||||
|
||||
wparams.vad_params.threshold = params.vad_threshold;
|
||||
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
|
||||
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
|
||||
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
|
||||
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
|
||||
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
|
||||
|
||||
const auto & grammar_parsed = params.grammar_parsed;
|
||||
@ -1220,7 +1169,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = fout_factory.print_segment_callback;
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
@ -1262,26 +1211,54 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// output stuff
|
||||
{
|
||||
// macros to stringify function name
|
||||
#define output_func(func, ext, param, ...) if (param && fout_factory.open(ext, #func)) {\
|
||||
func(ctx, fout_factory.fout, params, __VA_ARGS__); \
|
||||
}
|
||||
#define output_ext(ext, ...) output_func(output_##ext, "." #ext, params.output_##ext, __VA_ARGS__)
|
||||
printf("\n");
|
||||
|
||||
output_ext(txt, pcmf32s);
|
||||
output_ext(vtt, pcmf32s);
|
||||
output_ext(srt, pcmf32s);
|
||||
output_ext(wts, pcmf32s, fname_inp.c_str(), float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, fout_factory.fname_out.c_str());
|
||||
output_ext(csv, pcmf32s);
|
||||
output_func(output_json, ".json", params.output_jsn, pcmf32s);
|
||||
output_ext(lrc, pcmf32s);
|
||||
output_func(output_score, ".score.txt", params.log_score, pcmf32s);
|
||||
// output to text file
|
||||
if (params.output_txt) {
|
||||
const auto fname_txt = fname_out + ".txt";
|
||||
output_txt(ctx, fname_txt.c_str(), params, pcmf32s);
|
||||
}
|
||||
|
||||
#undef output_ext
|
||||
#undef output_func
|
||||
// output to VTT file
|
||||
if (params.output_vtt) {
|
||||
const auto fname_vtt = fname_out + ".vtt";
|
||||
output_vtt(ctx, fname_vtt.c_str(), params, pcmf32s);
|
||||
}
|
||||
|
||||
if (fout_factory.is_stdout && !fout_factory.used_stdout) {
|
||||
fprintf(stderr, "warning: '--output-file -' used without any other '--output-*'");
|
||||
// output to SRT file
|
||||
if (params.output_srt) {
|
||||
const auto fname_srt = fname_out + ".srt";
|
||||
output_srt(ctx, fname_srt.c_str(), params, pcmf32s);
|
||||
}
|
||||
|
||||
// output to WTS file
|
||||
if (params.output_wts) {
|
||||
const auto fname_wts = fname_out + ".wts";
|
||||
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, pcmf32s);
|
||||
}
|
||||
|
||||
// output to CSV file
|
||||
if (params.output_csv) {
|
||||
const auto fname_csv = fname_out + ".csv";
|
||||
output_csv(ctx, fname_csv.c_str(), params, pcmf32s);
|
||||
}
|
||||
|
||||
// output to JSON file
|
||||
if (params.output_jsn) {
|
||||
const auto fname_jsn = fname_out + ".json";
|
||||
output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
|
||||
}
|
||||
|
||||
// output to LRC file
|
||||
if (params.output_lrc) {
|
||||
const auto fname_lrc = fname_out + ".lrc";
|
||||
output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
|
||||
}
|
||||
|
||||
// output to score file
|
||||
if (params.log_score) {
|
||||
const auto fname_score = fname_out + ".score.txt";
|
||||
output_score(ctx, fname_score.c_str(), params, pcmf32s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,146 +0,0 @@
|
||||
/*! coi-serviceworker v0.1.7 - Guido Zuidhof and contributors, licensed under MIT */
|
||||
let coepCredentialless = false;
|
||||
if (typeof window === 'undefined') {
|
||||
self.addEventListener("install", () => self.skipWaiting());
|
||||
self.addEventListener("activate", (event) => event.waitUntil(self.clients.claim()));
|
||||
|
||||
self.addEventListener("message", (ev) => {
|
||||
if (!ev.data) {
|
||||
return;
|
||||
} else if (ev.data.type === "deregister") {
|
||||
self.registration
|
||||
.unregister()
|
||||
.then(() => {
|
||||
return self.clients.matchAll();
|
||||
})
|
||||
.then(clients => {
|
||||
clients.forEach((client) => client.navigate(client.url));
|
||||
});
|
||||
} else if (ev.data.type === "coepCredentialless") {
|
||||
coepCredentialless = ev.data.value;
|
||||
}
|
||||
});
|
||||
|
||||
self.addEventListener("fetch", function (event) {
|
||||
const r = event.request;
|
||||
if (r.cache === "only-if-cached" && r.mode !== "same-origin") {
|
||||
return;
|
||||
}
|
||||
|
||||
const request = (coepCredentialless && r.mode === "no-cors")
|
||||
? new Request(r, {
|
||||
credentials: "omit",
|
||||
})
|
||||
: r;
|
||||
event.respondWith(
|
||||
fetch(request)
|
||||
.then((response) => {
|
||||
if (response.status === 0) {
|
||||
return response;
|
||||
}
|
||||
|
||||
const newHeaders = new Headers(response.headers);
|
||||
newHeaders.set("Cross-Origin-Embedder-Policy",
|
||||
coepCredentialless ? "credentialless" : "require-corp"
|
||||
);
|
||||
if (!coepCredentialless) {
|
||||
newHeaders.set("Cross-Origin-Resource-Policy", "cross-origin");
|
||||
}
|
||||
newHeaders.set("Cross-Origin-Opener-Policy", "same-origin");
|
||||
|
||||
return new Response(response.body, {
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
headers: newHeaders,
|
||||
});
|
||||
})
|
||||
.catch((e) => console.error(e))
|
||||
);
|
||||
});
|
||||
|
||||
} else {
|
||||
(() => {
|
||||
const reloadedBySelf = window.sessionStorage.getItem("coiReloadedBySelf");
|
||||
window.sessionStorage.removeItem("coiReloadedBySelf");
|
||||
const coepDegrading = (reloadedBySelf == "coepdegrade");
|
||||
|
||||
// You can customize the behavior of this script through a global `coi` variable.
|
||||
const coi = {
|
||||
shouldRegister: () => !reloadedBySelf,
|
||||
shouldDeregister: () => false,
|
||||
coepCredentialless: () => true,
|
||||
coepDegrade: () => true,
|
||||
doReload: () => window.location.reload(),
|
||||
quiet: false,
|
||||
...window.coi
|
||||
};
|
||||
|
||||
const n = navigator;
|
||||
const controlling = n.serviceWorker && n.serviceWorker.controller;
|
||||
|
||||
// Record the failure if the page is served by serviceWorker.
|
||||
if (controlling && !window.crossOriginIsolated) {
|
||||
window.sessionStorage.setItem("coiCoepHasFailed", "true");
|
||||
}
|
||||
const coepHasFailed = window.sessionStorage.getItem("coiCoepHasFailed");
|
||||
|
||||
if (controlling) {
|
||||
// Reload only on the first failure.
|
||||
const reloadToDegrade = coi.coepDegrade() && !(
|
||||
coepDegrading || window.crossOriginIsolated
|
||||
);
|
||||
n.serviceWorker.controller.postMessage({
|
||||
type: "coepCredentialless",
|
||||
value: (reloadToDegrade || coepHasFailed && coi.coepDegrade())
|
||||
? false
|
||||
: coi.coepCredentialless(),
|
||||
});
|
||||
if (reloadToDegrade) {
|
||||
!coi.quiet && console.log("Reloading page to degrade COEP.");
|
||||
window.sessionStorage.setItem("coiReloadedBySelf", "coepdegrade");
|
||||
coi.doReload("coepdegrade");
|
||||
}
|
||||
|
||||
if (coi.shouldDeregister()) {
|
||||
n.serviceWorker.controller.postMessage({ type: "deregister" });
|
||||
}
|
||||
}
|
||||
|
||||
// If we're already coi: do nothing. Perhaps it's due to this script doing its job, or COOP/COEP are
|
||||
// already set from the origin server. Also if the browser has no notion of crossOriginIsolated, just give up here.
|
||||
if (window.crossOriginIsolated !== false || !coi.shouldRegister()) return;
|
||||
|
||||
if (!window.isSecureContext) {
|
||||
!coi.quiet && console.log("COOP/COEP Service Worker not registered, a secure context is required.");
|
||||
return;
|
||||
}
|
||||
|
||||
// In some environments (e.g. Firefox private mode) this won't be available
|
||||
if (!n.serviceWorker) {
|
||||
!coi.quiet && console.error("COOP/COEP Service Worker not registered, perhaps due to private mode.");
|
||||
return;
|
||||
}
|
||||
|
||||
n.serviceWorker.register(window.document.currentScript.src).then(
|
||||
(registration) => {
|
||||
!coi.quiet && console.log("COOP/COEP Service Worker registered", registration.scope);
|
||||
|
||||
registration.addEventListener("updatefound", () => {
|
||||
!coi.quiet && console.log("Reloading page to make use of updated COOP/COEP Service Worker.");
|
||||
window.sessionStorage.setItem("coiReloadedBySelf", "updatefound");
|
||||
coi.doReload();
|
||||
});
|
||||
|
||||
// If the registration is active, but it's not controlling the page
|
||||
if (registration.active && !n.serviceWorker.controller) {
|
||||
!coi.quiet && console.log("Reloading page to make use of COOP/COEP Service Worker.");
|
||||
window.sessionStorage.setItem("coiReloadedBySelf", "notcontrolling");
|
||||
coi.doReload();
|
||||
}
|
||||
},
|
||||
(err) => {
|
||||
!coi.quiet && console.error("COOP/COEP Service Worker failed to register:", err);
|
||||
}
|
||||
);
|
||||
})();
|
||||
}
|
@ -36,7 +36,7 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
-s INITIAL_MEMORY=1024MB \
|
||||
-s TOTAL_MEMORY=1024MB \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap', 'HEAPU8']\" \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
")
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
This is a basic Voice Assistant example that accepts voice commands from the microphone.
|
||||
It runs in fully in the browser via WebAseembly.
|
||||
|
||||
Online demo: https://ggerganov.github.io/whisper.cpp/command.wasm
|
||||
Online demo: https://whisper.ggerganov.com/command/
|
||||
|
||||
Terminal version: [examples/command](/examples/command)
|
||||
|
||||
@ -15,23 +15,9 @@ git clone https://github.com/ggerganov/whisper.cpp
|
||||
cd whisper.cpp
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake ..
|
||||
make -j libcommand
|
||||
```
|
||||
The example can then be started by running a local HTTP server:
|
||||
```console
|
||||
python3 examples/server.py
|
||||
```
|
||||
And then opening a browser to the following URL:
|
||||
http://localhost:8000/command.wasm/
|
||||
make -j
|
||||
|
||||
To run the example in a different server, you need to copy the following files
|
||||
to the server's HTTP path:
|
||||
```
|
||||
# copy the produced page to your HTTP path
|
||||
cp bin/command.wasm/* /path/to/html/
|
||||
cp bin/libcommand.js /path/to/html/
|
||||
cp bin/libcommand.worker.js /path/to/html/
|
||||
```
|
||||
|
||||
> 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no
|
||||
> longer generated and the worker is embedded in the main JS file. So the worker
|
||||
> file will not be geneated for versions later than `3.1.58`.
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user