mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-24 17:15:19 +00:00
Compare commits
1 Commits
v1.7.4
...
macros-cvt
Author | SHA1 | Date | |
---|---|---|---|
e0bd97f41f |
@ -1,28 +0,0 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake libsdl2-dev wget
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable cuBLAS
|
||||
ENV GGML_CUDA=1
|
||||
|
||||
RUN make base.en
|
||||
|
||||
ENTRYPOINT ["/app/main"]
|
@ -1,40 +0,0 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=12.3.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
WORKDIR /app
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable cuBLAS
|
||||
ENV GGML_CUDA=1
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
# Ref: https://stackoverflow.com/a/53464012
|
||||
ENV CUDA_MAIN_VERSION=12.3
|
||||
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
|
||||
|
||||
COPY .. .
|
||||
RUN make base.en
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
ENV CUDA_MAIN_VERSION=12.3
|
||||
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg wget cmake \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
@ -1,19 +0,0 @@
|
||||
FROM ubuntu:22.04 AS build
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential wget cmake \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY .. .
|
||||
RUN make base.en
|
||||
|
||||
FROM ubuntu:22.04 AS runtime
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg libsdl2-dev wget cmake \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
22
.github/workflows/bindings-go.yml
vendored
22
.github/workflows/bindings-go.yml
vendored
@ -1,22 +0,0 @@
|
||||
name: Bindings Tests (Go)
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- bindings/go/**
|
||||
- whisper.h
|
||||
pull_request:
|
||||
paths:
|
||||
- bindings/go/**
|
||||
- whisper.h
|
||||
|
||||
jobs:
|
||||
ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '^1.23'
|
||||
- uses: actions/checkout@v4
|
||||
- run: |
|
||||
cd bindings/go
|
||||
make test
|
55
.github/workflows/bindings-ruby.yml
vendored
55
.github/workflows/bindings-ruby.yml
vendored
@ -1,55 +0,0 @@
|
||||
name: Bindings Tests (Ruby)
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- bindings/ruby/**
|
||||
- src/**/*.c
|
||||
- src/**/*.cpp
|
||||
- src/**/*.h
|
||||
- src/**/*.m
|
||||
- src/**/*.metal
|
||||
- include/**/*.c
|
||||
- include/**/*.cpp
|
||||
- include/**/*.h
|
||||
- include/**/*.m
|
||||
- include/**/*.metal
|
||||
- ggml/**/*.c
|
||||
- ggml/**/*.cpp
|
||||
- ggml/**/*.h
|
||||
- ggml/**/*.m
|
||||
- ggml/**/*.metal
|
||||
- scripts/get-flags.mk
|
||||
- examples/dr_wav.h
|
||||
pull_request:
|
||||
paths:
|
||||
- bindings/ruby/**
|
||||
- src/**/*.c
|
||||
- src/**/*.cpp
|
||||
- src/**/*.h
|
||||
- src/**/*.m
|
||||
- src/**/*.metal
|
||||
- include/**/*.c
|
||||
- include/**/*.cpp
|
||||
- include/**/*.h
|
||||
- include/**/*.m
|
||||
- include/**/*.metal
|
||||
- ggml/**/*.c
|
||||
- ggml/**/*.cpp
|
||||
- ggml/**/*.h
|
||||
- ggml/**/*.m
|
||||
- ggml/**/*.metal
|
||||
- scripts/get-flags.mk
|
||||
- examples/dr_wav.h
|
||||
|
||||
jobs:
|
||||
ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: bindings/ruby
|
||||
steps:
|
||||
- uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: '3.1'
|
||||
- uses: actions/checkout@v4
|
||||
- run: rake test
|
976
.github/workflows/build.yml
vendored
976
.github/workflows/build.yml
vendored
@ -1,797 +1,185 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
ubuntu_image: "ubuntu:22.04"
|
||||
VCPKG_BINARY_SOURCES: "clear;x-gha,readwrite"
|
||||
on: [push]
|
||||
|
||||
jobs:
|
||||
ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/amd64, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential libsdl2-dev cmake
|
||||
cmake -B build
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
ubuntu-latest-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/arm64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential libsdl2-dev cmake
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
ubuntu-latest-arm-v7:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/arm/v7]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential libsdl2-dev cmake
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
macOS-latest:
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew update
|
||||
brew install sdl2 cmake
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build
|
||||
# uses: cross-platform-actions/action@v0.24.0
|
||||
# with:
|
||||
# operating_system: freebsd
|
||||
# version: '13.3'
|
||||
# run: |
|
||||
# sudo pkg update
|
||||
# sudo pkg install -y gmake sdl2 cmake
|
||||
# cmake -B build
|
||||
# cmake --build build --config Release
|
||||
|
||||
ubuntu-latest-gcc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/amd64, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-gcc-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/arm64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-gcc-arm-v7:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/arm/v7]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-clang:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
#arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
# TODO: arm/v7 disabled due to clang bug
|
||||
# https://github.com/ggerganov/whisper.cpp/actions/runs/9657764109/job/26637633042?pr=2256#step:4:1990
|
||||
arch: [linux/amd64, linux/arm64, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y clang build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-latest-gcc-sanitized:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
arch: [linux/amd64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake
|
||||
cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
dwhisper_sycl: [ON]
|
||||
dcmake_c_compiler: [icx]
|
||||
dcmake_cxx_compiler: [icpx]
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-sycl-fp16:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
dwhisper_sycl: [ON]
|
||||
dcmake_c_compiler: [icx]
|
||||
dcmake_cxx_compiler: [icpx]
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DGGML_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
windows-msys2:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
|
||||
- { sys: CLANG64, env: clang-x86_64, build: Release }
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
uses: msys2/setup-msys2@v2
|
||||
with:
|
||||
update: true
|
||||
msystem: ${{matrix.sys}}
|
||||
install: >-
|
||||
base-devel
|
||||
mingw-w64-${{matrix.env}}-toolchain
|
||||
mingw-w64-${{matrix.env}}-cmake
|
||||
mingw-w64-${{matrix.env}}-SDL2
|
||||
mingw-w64-${{matrix.env}}-openblas
|
||||
|
||||
- name: Build using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build -DWHISPER_SDL2=ON
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
- name: Clean after building using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
rm -rf build
|
||||
|
||||
- name: Build using CMake w/ OpenBLAS
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
arch: [Win32, x64]
|
||||
sdl2: [ON]
|
||||
include:
|
||||
- arch: Win32
|
||||
s2arc: x86
|
||||
jnaPath: win32-x86
|
||||
- arch: x64
|
||||
s2arc: x64
|
||||
jnaPath: win32-x86-64
|
||||
- sdl2: ON
|
||||
s2ver: 2.28.5
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v2
|
||||
|
||||
- name: Fetch SDL2 and set SDL2_DIR
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: |
|
||||
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
|
||||
7z x sdl2.zip
|
||||
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
|
||||
|
||||
- name: Configure
|
||||
run: >
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }}
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd ./build
|
||||
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Upload dll
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.jnaPath }}_whisper.dll
|
||||
path: build/bin/${{ matrix.build }}/whisper.dll
|
||||
|
||||
- name: Upload binaries
|
||||
if: matrix.sdl2 == 'ON'
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: whisper-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
windows-blas:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
arch: [Win32, x64]
|
||||
blas: [ON]
|
||||
sdl2: [ON]
|
||||
include:
|
||||
- arch: Win32
|
||||
s2arc: x86
|
||||
- arch: x64
|
||||
s2arc: x64
|
||||
- sdl2: ON
|
||||
s2ver: 2.28.5
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Export GitHub Actions cache environment variables
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
|
||||
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v2
|
||||
|
||||
- name: Install OpenBLAS and pkgconfiglite
|
||||
if: matrix.blas == 'ON'
|
||||
run: |
|
||||
vcpkg install --triplet=${{ matrix.s2arc }}-windows openblas
|
||||
choco install pkgconfiglite
|
||||
|
||||
- name: Fetch SDL2 and set SDL2_DIR
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: |
|
||||
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
|
||||
7z x sdl2.zip
|
||||
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
|
||||
|
||||
- name: Configure
|
||||
run: >
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
-DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
-DGGML_BLAS=${{ matrix.blas }}
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }}
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd ./build
|
||||
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
|
||||
- name: Copy openblas.dll
|
||||
if: matrix.blas == 'ON'
|
||||
run: copy "C:/vcpkg/packages/openblas_${{ matrix.s2arc }}-windows/bin/openblas.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Upload binaries
|
||||
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: whisper-blas-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
windows-cublas:
|
||||
runs-on: windows-2019
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
arch: [x64]
|
||||
cublas: [ON]
|
||||
sdl2: [ON]
|
||||
cuda-toolkit: [12.2.0, 11.8.0]
|
||||
include:
|
||||
- arch: x64
|
||||
sdl2: ON
|
||||
sdl2_ver: 2.28.5
|
||||
steps:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v2
|
||||
|
||||
- name: Install CUDA Toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: '${{ matrix.cuda-toolkit }}'
|
||||
|
||||
- name: Install 7-Zip
|
||||
run: choco install 7zip -y
|
||||
|
||||
- name: Fetch SDL2 and set SDL2_DIR
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: |
|
||||
Invoke-WebRequest -Uri https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.sdl2_ver }}/SDL2-devel-${{ matrix.sdl2_ver }}-VC.zip -OutFile sdl2.zip
|
||||
7z x sdl2.zip
|
||||
echo "SDL2_DIR=${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
echo "${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" > SDL2_PATH.txt
|
||||
|
||||
- name: Configure CMake
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }} ^
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
|
||||
-DGGML_CUDA=${{ matrix.cublas }} ^
|
||||
-DCMAKE_CUDA_ARCHITECTURES=all ^
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }} ^
|
||||
-DSDL2_DIR="%SDL2_DIR%"
|
||||
|
||||
- name: Build Project
|
||||
shell: cmd
|
||||
run: |
|
||||
cd ./build
|
||||
cmake --build . --config ${{ matrix.build }}
|
||||
|
||||
- name: Copy CUDA DLLs
|
||||
run: |
|
||||
Get-ChildItem "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/" -Filter "*.dll" |
|
||||
Copy-Item -Destination "build/bin/${{ matrix.build }}"
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.arch }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
emscripten:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup emsdk
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
|
||||
- name: Verify
|
||||
run: emcc -v
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
make
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Configure
|
||||
run: |
|
||||
cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
|
||||
mkdir models/ggml-base.en-encoder.mlmodelc
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DWHISPER_BUILD_EXAMPLES=OFF \
|
||||
-DWHISPER_BUILD_TESTS=OFF \
|
||||
-DWHISPER_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
sudo cmake --install . --config Release
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme whisper-Package -destination 'generic/platform=iOS'
|
||||
|
||||
#- name: Build objc example
|
||||
# run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphoneos build
|
||||
|
||||
- name: Build swiftui example
|
||||
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
android:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: whisper
|
||||
|
||||
- name: Install Java
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
distribution: zulu
|
||||
java-version: 21
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v3
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd whisper/examples/whisper.android
|
||||
./gradlew assembleRelease --no-daemon
|
||||
|
||||
- name: Build with external ggml
|
||||
run: |
|
||||
export PATH_TO_GGML=$PWD/ggml
|
||||
cd whisper/examples/whisper.android
|
||||
./gradlew assembleRelease --no-daemon
|
||||
|
||||
# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
|
||||
# android_java:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: set up JDK 11
|
||||
# uses: actions/setup-java@v4
|
||||
# with:
|
||||
# java-version: '11'
|
||||
# distribution: 'temurin'
|
||||
# cache: gradle
|
||||
#
|
||||
# - name: Setup Android SDK
|
||||
# uses: android-actions/setup-android@v3
|
||||
# with:
|
||||
# cmdline-tools-version: 9.0
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cd examples/whisper.android.java
|
||||
# chmod +x ./gradlew
|
||||
# ./gradlew assembleRelease
|
||||
|
||||
# TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
|
||||
# java:
|
||||
# needs: [ 'windows' ]
|
||||
# runs-on: windows-latest
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Install Java
|
||||
# uses: actions/setup-java@v4
|
||||
# with:
|
||||
# distribution: zulu
|
||||
# java-version: 20
|
||||
#
|
||||
# - name: Download Windows lib
|
||||
# uses: actions/download-artifact@v4
|
||||
# with:
|
||||
# name: win32-x86-64_whisper.dll
|
||||
# path: bindings/java/build/generated/resources/main/win32-x86-64
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# models\download-ggml-model.cmd tiny.en
|
||||
# cd bindings/java
|
||||
# chmod +x ./gradlew
|
||||
# ./gradlew build
|
||||
#
|
||||
# - name: Upload jar
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: whispercpp.jar
|
||||
# path: bindings/java/build/libs/whispercpp-*.jar
|
||||
#
|
||||
# - name: Publish package
|
||||
# if: ${{ github.ref == 'refs/heads/master' }}
|
||||
# uses: gradle/gradle-build-action@v2.4.2
|
||||
# with:
|
||||
# arguments: publish
|
||||
# build-root-directory: bindings/java
|
||||
# env:
|
||||
# MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
|
||||
# MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
|
||||
# PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
|
||||
# PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
|
||||
|
||||
quantize:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test quantize
|
||||
run: |
|
||||
./models/download-ggml-model.sh tiny.en
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
./build/bin/quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
|
||||
ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
make stream
|
||||
|
||||
macOS-latest:
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew update
|
||||
brew install sdl2
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
make stream
|
||||
|
||||
ubuntu-latest-gcc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Configure
|
||||
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
ctest -L gh --output-on-failure
|
||||
|
||||
ubuntu-latest-clang:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Configure
|
||||
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
ctest -L gh --output-on-failure
|
||||
|
||||
ubuntu-latest-gcc-sanitized:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
|
||||
- name: Configure
|
||||
run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
ctest -L gh --output-on-failure
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [RelWithDebInfo]
|
||||
arch: [Win32, x64]
|
||||
blas: [ON]
|
||||
sdl2: [ON]
|
||||
include:
|
||||
- arch: Win32
|
||||
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
|
||||
s2arc: x86
|
||||
- arch: x64
|
||||
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
|
||||
s2arc: x64
|
||||
- sdl2: ON
|
||||
s2ver: 2.26.0
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v1
|
||||
|
||||
- name: Fetch OpenBLAS
|
||||
if: matrix.blas == 'ON'
|
||||
run: |
|
||||
C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
|
||||
7z x blas.zip -oblas -y
|
||||
copy blas/include/cblas.h .
|
||||
copy blas/include/openblas_config.h .
|
||||
echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
|
||||
|
||||
- name: Fetch SDL2 and set SDL2_DIR
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: |
|
||||
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
|
||||
7z x sdl2.zip
|
||||
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
|
||||
|
||||
- name: Configure
|
||||
run: >
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
-DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
-DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
|
||||
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd ./build
|
||||
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
|
||||
- name: Copy libopenblas.dll
|
||||
if: matrix.blas == 'ON'
|
||||
run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
- name: Upload binaries
|
||||
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
|
||||
uses: actions/upload-artifact@v1
|
||||
with:
|
||||
name: whisper-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
59
.github/workflows/docker.yml
vendored
59
.github/workflows/docker.yml
vendored
@ -1,59 +0,0 @@
|
||||
name: Publish Docker image
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
|
||||
#TODO: the cuda image keeps failing - disable for now
|
||||
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
|
||||
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build and push Docker image (versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platform }}
|
||||
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
- name: Build and push Docker image (tagged)
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: ${{ matrix.config.platform }}
|
||||
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
48
.github/workflows/examples.yml
vendored
48
.github/workflows/examples.yml
vendored
@ -1,48 +0,0 @@
|
||||
name: Examples Tests
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- examples/addon.node/**
|
||||
- whisper.h
|
||||
pull_request:
|
||||
paths:
|
||||
- examples/addon.node/**
|
||||
- whisper.h
|
||||
|
||||
jobs:
|
||||
addon_node-ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
node-version: [ 16.x, 18.x ]
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v1
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
cache: 'npm'
|
||||
|
||||
- name: Install package.json dependencies
|
||||
working-directory: ./examples/addon.node
|
||||
run: npm install
|
||||
|
||||
- name: Compile addon.node
|
||||
run: npx cmake-js compile -T addon.node -B Release
|
||||
|
||||
- name: Download test model
|
||||
run: |
|
||||
bash ./models/download-ggml-model.sh base.en
|
||||
- name: Test
|
||||
run: |
|
||||
cd examples/addon.node
|
||||
npm run test
|
54
.gitignore
vendored
54
.gitignore
vendored
@ -1,60 +1,24 @@
|
||||
*.o
|
||||
*.a
|
||||
*.d
|
||||
.cache/
|
||||
.coreml/
|
||||
.test/
|
||||
.venv/
|
||||
.vs/
|
||||
.vscode/
|
||||
.DS_Store
|
||||
.vimspector.json
|
||||
/CMakeSettings.json
|
||||
/talk-llama.dSYM/
|
||||
|
||||
build/
|
||||
build-*/
|
||||
build-em/
|
||||
build-debug/
|
||||
build-release/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
|
||||
# SPM
|
||||
.build/
|
||||
.swiftpm
|
||||
*.metallib
|
||||
|
||||
ggml-metal-embed.metal
|
||||
ggml-metal-embed.metal.tmp
|
||||
|
||||
/main
|
||||
/stream
|
||||
/command
|
||||
/talk
|
||||
/talk-llama
|
||||
/bench
|
||||
/quantize
|
||||
/server
|
||||
/lsp
|
||||
|
||||
arm_neon.h
|
||||
main
|
||||
stream
|
||||
command
|
||||
bench
|
||||
sync.sh
|
||||
libwhisper.a
|
||||
libwhisper.so
|
||||
compile_commands.json
|
||||
|
||||
examples/arm_neon.h
|
||||
examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
|
||||
examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
|
||||
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
|
||||
|
||||
extra/bench-gg.txt
|
||||
|
||||
models/*.mlmodel
|
||||
models/*.mlmodelc
|
||||
models/*.mlpackage
|
||||
bindings/java/.gradle/
|
||||
bindings/java/.idea/
|
||||
.idea/
|
||||
|
||||
benchmark_results.csv
|
||||
cmake-build-debug/
|
||||
.cxx/
|
||||
.gradle/
|
||||
local.properties
|
||||
|
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -0,0 +1,3 @@
|
||||
[submodule "bindings/ios"]
|
||||
path = bindings/ios
|
||||
url = https://github.com/ggerganov/whisper.spm
|
||||
|
301
AUTHORS
301
AUTHORS
@ -1,301 +0,0 @@
|
||||
# date: Tue Apr 9 20:27:03 EEST 2024
|
||||
# this file is auto-generated by scripts/gen-authors.sh
|
||||
|
||||
0/0 <zero@imaskeleton.me>
|
||||
0cc4m <picard12@live.de>
|
||||
0xsourcecode <134374803+0xsourcecode@users.noreply.github.com>
|
||||
AT <manyoso@users.noreply.github.com>
|
||||
Aarni Koskela <akx@iki.fi>
|
||||
Aaron Pham <29749331+aarnphm@users.noreply.github.com>
|
||||
Aaron Taylor <aaron@exphat.com>
|
||||
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||
Abitofevrything <54505189+abitofevrything@users.noreply.github.com>
|
||||
AfryMask <AfryMask@163.com>
|
||||
Ahmad Bilal <ahmad.bilal@empglabs.com>
|
||||
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||
Akash Mahajan <akash7190@gmail.com>
|
||||
Akash Mahajan <akashmjn@stanford.edu>
|
||||
Al Hoang <3811822-hoanga@users.noreply.gitlab.com>
|
||||
Alan <unknown>
|
||||
Aleksander Andrzejewski <18704749+aleksanderandrzejewski@users.noreply.github.com>
|
||||
Alex Azarov <alex@azarov.by>
|
||||
Alex Bacart <13940752+alex-bacart@users.noreply.github.com>
|
||||
Alex Evgrashin <aevgrashin@yandex.ru>
|
||||
Alexandr Graschenkov <alexandr.graschenkov91@gmail.com>
|
||||
Alexandru Mariuti <alex@mariuti.com>
|
||||
Alexey Kharlamov <alexey@kharlamov.biz>
|
||||
Alfredo Montesinos <alfredo.montesinos@g.austincc.edu>
|
||||
Ali Alameh <ali.alameh@isae.edu.lb>
|
||||
Ananta Bastola <anantarajbastola@gmail.com>
|
||||
Andreu Huguet <andreuhuguet@gmail.com>
|
||||
Andrew Huynh <a5thuynh@gmail.com>
|
||||
Andrew S <andrews54757@gmail.com>
|
||||
Andy Maloney <asmaloney@gmail.com>
|
||||
Anton Kostin <masguit42@users.noreply.github.com>
|
||||
Artyom Mezin <psycho.fading@gmail.com>
|
||||
Asad Memon <asad.lionpk@gmail.com>
|
||||
Ashraful Islam <ashraful.meche@gmail.com>
|
||||
AsukaMinato <asukaminato@nyan.eu.org>
|
||||
AustinMroz <austinmroz@utexas.edu>
|
||||
Avik Sengupta <avik@sengupta.net>
|
||||
Bader-eddine Ouaich <49657842+baderouaich@users.noreply.github.com>
|
||||
Baffin Lee <baffinlee@gmail.com>
|
||||
Ben Nortier <bjnortier@gmail.com>
|
||||
Benjamin Heiniger <benjamin.heiniger@bluewin.ch>
|
||||
Bo-Yi Wu <appleboy.tw@gmail.com>
|
||||
Boris Bliznioukov <blib@mail.com>
|
||||
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||
Brad Murray <59848399+bradmurray-dt@users.noreply.github.com>
|
||||
Brian Murray <brian@bmurray.ca>
|
||||
CRD716 <crd716@gmail.com>
|
||||
Canis Lupus <Canis-UK@users.noreply.github.com>
|
||||
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
|
||||
ChangSeok Oh <shivamidow@users.noreply.github.com>
|
||||
Chaoqun <27287694+OpenWaygate@users.noreply.github.com>
|
||||
Chia-Hsiang Cheng <88014292+garychia@users.noreply.github.com>
|
||||
Chidi Williams <williamschidi1@gmail.com>
|
||||
Christian <12550267+iceychris@users.noreply.github.com>
|
||||
Clifford Heath <clifford.heath@gmail.com>
|
||||
Colin <github@whoisc.cc>
|
||||
DGdev91 <DGdev91@users.noreply.github.com>
|
||||
Damian Czaja <trojan295@protonmail.com>
|
||||
Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||
David <dnhkng@gmail.com>
|
||||
David Thorpe <djt@mutablelogic.com>
|
||||
Davidson Francis <davidsondfgl@gmail.com>
|
||||
Dener Stassun <denerstassun@gmail.com>
|
||||
Didzis Gosko <didzis@users.noreply.github.com>
|
||||
Digipom <admin@digipom.com>
|
||||
Dimo <dimo@ieee.org>
|
||||
Dody Suria Wijaya <dodysw@gmail.com>
|
||||
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||
Duncan McConnell <ddmcconnell4@gmail.com>
|
||||
Egor Egorov <me@egorfine.com>
|
||||
Elkana Bardugo <ttv200@gmail.com>
|
||||
Emmanuel Schmidbauer <eschmidbauer@gmail.com>
|
||||
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||
Eric Swanson <eswanson@alloscomp.com>
|
||||
Eric Tendian <erictendian@gmail.com>
|
||||
Erik Scholz <Green-Sky@users.noreply.github.com>
|
||||
Evan Jones <evan.q.jones@gmail.com>
|
||||
Evan Martin <evan.martin@gmail.com>
|
||||
Eve <139727413+netrunnereve@users.noreply.github.com>
|
||||
Evgeny Kuznetsov <evgeny@kuznetsov.md>
|
||||
F1L1P <78918286+F1L1Pv2@users.noreply.github.com>
|
||||
Fangjun Kuang <csukuangfj@gmail.com>
|
||||
Felix <stenbackfelix@gmail.com>
|
||||
Finn Voorhees <finnvoorhees@gmail.com>
|
||||
FlippFuzz <41221030+FlippFuzz@users.noreply.github.com>
|
||||
Gang Chen <goncha@gmail.com>
|
||||
Gavin Cai <gavin1818@hotmail.com>
|
||||
George Hindle <george@georgehindle.com>
|
||||
Georgi Gerganov <ggerganov@gmail.com>
|
||||
GitAritron <103900385+GitAritron@users.noreply.github.com>
|
||||
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||
Gleicon Moraes <gleicon@gmail.com>
|
||||
Gregor Jasny <gjasny@googlemail.com>
|
||||
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||
HY. Kelvin Lee <34256578+hykelvinlee42@users.noreply.github.com>
|
||||
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||
Hang <bebound@gmail.com>
|
||||
Herman Semenov <GermanAizek@yandex.ru>
|
||||
Hrishikesh Barman <geekodour@users.noreply.github.com>
|
||||
Ian Bicking <ian@ianbicking.org>
|
||||
Ian Bull <irbull@eclipsesource.com>
|
||||
Ikko Ashimine <eltociear@gmail.com>
|
||||
InconsolableCellist <23345188+InconsolableCellist@users.noreply.github.com>
|
||||
Ismatulla Mansurov <47342870+sapoepsilon@users.noreply.github.com>
|
||||
Ivan Gorin <ivangorin21@gmail.com>
|
||||
JJ <103335846+computerscienceiscool@users.noreply.github.com>
|
||||
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||
JacobLinCool <jacoblincool@gmail.com>
|
||||
Jakub Ráček <blizzcz@gmail.com>
|
||||
Jared Van Bortel <jared@nomic.ai>
|
||||
Jay Binks <jaybinks@gmail.com>
|
||||
Jhen-Jie Hong <developer@jhen.me>
|
||||
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||
JidongZhang-THU <1119708529@qq.com>
|
||||
Jo Liss <joliss42@gmail.com>
|
||||
Johan <jr.raffin@gmail.com>
|
||||
Johannes Gäßler <johannesg@5d6.de>
|
||||
John Balis <phobossystems@gmail.com>
|
||||
Jonathan Soo <jcsoo@agora.com>
|
||||
Jonno <1160532+razodactyl@users.noreply.github.com>
|
||||
Joonas Pihlajamaa <joonas.pihlajamaa@iki.fi>
|
||||
Jose <34888496+Jerry-Master@users.noreply.github.com>
|
||||
Josh Bleecher Snyder <josharian@gmail.com>
|
||||
Judd <foldl@users.noreply.github.com>
|
||||
Jumper775 <78500318+jumpers775@users.noreply.github.com>
|
||||
Justine Tunney <jtunney@gmail.com>
|
||||
KP Kaiser <kirk@zothcorp.com>
|
||||
Kamilake <exjang0@gmail.com>
|
||||
Kartik Saranathan <278928+Kartiku@users.noreply.github.com>
|
||||
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||
Kevin Brothaler <admin@digipom.com>
|
||||
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||
Kreijstal <rainb@tfwno.gf>
|
||||
Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||
LBlue <153975653+lbluep@users.noreply.github.com>
|
||||
Larry Battle <larry.battle.tech@gmail.com>
|
||||
Laytan Laats <laytanlaats@hotmail.com>
|
||||
Leo Moll <leo.moll@yeasoft.com>
|
||||
Lexevolution <31176843+Lexevolution@users.noreply.github.com>
|
||||
LittleLoli <26589867+WhichWho@users.noreply.github.com>
|
||||
Lucas Zanek <57494138+LucasZNK@users.noreply.github.com>
|
||||
Luis Herrera <herrera-luis@users.noreply.github.com>
|
||||
Lukas Rist <glaslos@gmail.com>
|
||||
M. A. Ali <73258591+MightyStud@users.noreply.github.com>
|
||||
M. Eren Akbiyik <erenakbiyik@gmail.com>
|
||||
Maciek <maciek.mab122@gmail.com>
|
||||
Marcin Mielniczuk <marmistrz.dev@zoho.eu>
|
||||
Martin Warnaar <martinwarnaar@gmail.com>
|
||||
Matheus de Sousa <23645013+keyehzy@users.noreply.github.com>
|
||||
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||
Matija Pevec <mightymatth@users.noreply.github.com>
|
||||
Maximiliano Levi <8160966+maxilevi@users.noreply.github.com>
|
||||
Meng, Hengyu <hengyu.meng@intel.com>
|
||||
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||
Michael Rienstra <mrienstra@gmail.com>
|
||||
Mikhail Grigorev <sleuthhound@gmail.com>
|
||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||
Mohit Agarwal <mohit@sdf.org>
|
||||
Murilo Santana <mvrilo@gmail.com>
|
||||
Neil Chudleigh <nchudleigh@users.noreply.github.com>
|
||||
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||
Neuman Vong <neuman.vong@gmail.com>
|
||||
Nicholas Albion <nalbion@yahoo.com>
|
||||
Niels Mayer <Niels.Mayer@gmail.com>
|
||||
Okabintaro <103938900+Okabintaro@users.noreply.github.com>
|
||||
Oleg Sidorov <me@whitebox.io>
|
||||
Oleg Sidorov <oleg@sidorov.nl>
|
||||
Ondrej Kokes <ondrej.kokes@gmail.com>
|
||||
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||
Philipp Zabel <philipp.zabel@gmail.com>
|
||||
Philippe Normand <phil@base-art.net>
|
||||
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||
Qianhe Chen <54462604+chenqianhe@users.noreply.github.com>
|
||||
Radosław Gryta <radek.gryta@gmail.com>
|
||||
Reinforce-II <fate@eastal.com>
|
||||
Reinis Muiznieks <muiznieks.reinis@gmail.com>
|
||||
RelatedTitle <r3latedtitle@gmail.com>
|
||||
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
||||
Rich Jones <miserlou@gmail.com>
|
||||
Robin <robin.xw@hotmail.com>
|
||||
Roddur Dasgupta <roddurd@gmail.com>
|
||||
Roland Rabien <figbug@gmail.com>
|
||||
Rotem Dan <rotemdan@gmail.com>
|
||||
Ryan Hitchman <hitchmanr@gmail.com>
|
||||
Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com>
|
||||
RyanChang <ftes90015@gmail.com>
|
||||
Sam <49637763+Onlyartist9@users.noreply.github.com>
|
||||
Sam Pullara <spullara@gmail.com>
|
||||
Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
|
||||
Sergio López <slp@sinrega.org>
|
||||
Siddharth Ramakrishnan <srr2141@columbia.edu>
|
||||
Simon Moisselin <simon.moisstoll@gmail.com>
|
||||
Sindre Sorhus <sindresorhus@gmail.com>
|
||||
Slava Primenko <primenko.s@gmail.com>
|
||||
Syahmi Azhar <prsyahmi@gmail.com>
|
||||
Syed Jafri <syedjafri97@gmail.com>
|
||||
Sơn Phan Trung <phantrungson17@gmail.com>
|
||||
Taisei Mima <bhbstar.me@gmail.com>
|
||||
Takeshi Inoue <inoue.takeshi@gmail.com>
|
||||
Tamotsu Takahashi <ttakah+github@gmail.com>
|
||||
Taras Glek <taras@thegp.com>
|
||||
Tauseef Mohiuddin <35351464+tauseefmohammed2@users.noreply.github.com>
|
||||
Thijs Raymakers <thijs@raymakers.nl>
|
||||
Thomas Fitzsimmons <fitzsim@fitzsim.org>
|
||||
Tiago Fassoni <tiagofassoni@users.noreply.github.com>
|
||||
Tienshiao Ma <tienshiao@tienshiao.org>
|
||||
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||
Tobrun <tobrun.van.nuland@gmail.com>
|
||||
Todd <taf2@users.noreply.github.com>
|
||||
Tong Li <31761981+litongjava@users.noreply.github.com>
|
||||
Topping1 <78745143+Topping1@users.noreply.github.com>
|
||||
Travis Cline <travis.cline@gmail.com>
|
||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||
Vadim Peretokin <vperetokin@hey.com>
|
||||
Valentin Gosu <1454649+valenting@users.noreply.github.com>
|
||||
Vulcan <93451215+trholding@users.noreply.github.com>
|
||||
WhiteOlivierus <36532695+WhiteOlivierus@users.noreply.github.com>
|
||||
Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||
Yajing Tang <phillis@google.com>
|
||||
Yang Shen <aplshenyang@gmail.com>
|
||||
Yunès <jean.baptiste.yunes@free.fr>
|
||||
ZaBlazzingZephyrus <119159668+blazingzephyr@users.noreply.github.com>
|
||||
Zigfrid Zvezdin <ziggerZZ@gmail.com>
|
||||
Zollner <24618122+Zolliner@users.noreply.github.com>
|
||||
ai-at-home <149282006+ai-at-home@users.noreply.github.com>
|
||||
alonfaraj <alonfaraj@gmail.com>
|
||||
andypayne <apayne@gmail.com>
|
||||
ardfork <134447697+ardfork@users.noreply.github.com>
|
||||
automaticcat <daogiatuank54@gmail.com>
|
||||
be-next <jerome.ramette@gmail.com>
|
||||
bert hubert <bert@hubertnet.nl>
|
||||
bmwl <brian.marshall@tolko.com>
|
||||
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||
bocytko <bocytko+github@gmail.com>
|
||||
boolemancer <48014766+boolemancer@users.noreply.github.com>
|
||||
boolemancer <boolemancer@gmail.com>
|
||||
bradmit <151883577+bradmit@users.noreply.github.com>
|
||||
brunofaustino <b.fa.amorim@gmail.com>
|
||||
bssrdf <merlintiger@hotmail.com>
|
||||
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
||||
cdosoftei <ciprian.dosoftei@gmail.com>
|
||||
clach04 <Chris.Clark@actian.com>
|
||||
compilade <113953597+compilade@users.noreply.github.com>
|
||||
conradg <conradjgodfrey@gmail.com>
|
||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||
denersc <denerstassun@gmail.com>
|
||||
dscripka <dscripka@users.noreply.github.com>
|
||||
duthils <duthils@duthils.net>
|
||||
ecneladis <ecneladis@users.noreply.github.com>
|
||||
faker <nspyia2002@gmail.com>
|
||||
fitzsim <fitzsim@fitzsim.org>
|
||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||
genevera (she/her) <genevera@users.noreply.github.com>
|
||||
geniusnut <geniusnut@gmail.com>
|
||||
greeshmay <greeshmay@gmail.com>
|
||||
hydai <z54981220@gmail.com>
|
||||
iamthad <thadeus.j.fleming@gmail.com>
|
||||
james wolf <contractorwolf@hotmail.com>
|
||||
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||
jorismertz <35079666+jorismertz@users.noreply.github.com>
|
||||
junkfood <69683722+JunkFood02@users.noreply.github.com>
|
||||
jwijffels <jwijffels@bnosac.be>
|
||||
kamranjon <kamranjon@gmail.com>
|
||||
katsu560 <katsu560oo-@docomo.ne.jp>
|
||||
kennethge <57784063+kenneth-ge@users.noreply.github.com>
|
||||
keyehzy <msamuel@aluno.puc-rio.br>
|
||||
leejet <leejet714@gmail.com>
|
||||
litong <31761981+litongjava@users.noreply.github.com>
|
||||
lnyan <lkwq007@gmail.com>
|
||||
m.bell <m.bell@techsmith.com>
|
||||
mkiol <mkiol@users.noreply.github.com>
|
||||
novag <7754358+novag@users.noreply.github.com>
|
||||
pajowu <pajowu@pajowu.de>
|
||||
polarmoon <90010972+polarmoon@users.noreply.github.com>
|
||||
rlapray <lapray.romain@gmail.com>
|
||||
sandrohanea <40202887+sandrohanea@users.noreply.github.com>
|
||||
semiformal-net <84111142+semiformal-net@users.noreply.github.com>
|
||||
shibukazu <61775791+shibukazu@users.noreply.github.com>
|
||||
shikokuchuo <53399081+shikokuchuo@users.noreply.github.com>
|
||||
slaren <slarengh@gmail.com>
|
||||
slashlib <slashlib@users.noreply.github.com>
|
||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||
st-gr <38470677+st-gr@users.noreply.github.com>
|
||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||
thefinaldegree <thefinaldegree@gmail.com>
|
||||
trixirt <trix@redhat.com>
|
||||
ulatekh <ulatekh@yahoo.com>
|
||||
undef <undefdev@gmail.com>
|
||||
venkr <venkateshrameshkumar+1@gmail.com>
|
||||
vicalloy <zbirder@gmail.com>
|
||||
xdrudis <xavierdrudis@yahoo.es>
|
||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||
布客飞龙 <562826179@qq.com>
|
||||
Артём Земляк <azemlyak@smart-consulting.ru>
|
315
CMakeLists.txt
315
CMakeLists.txt
@ -1,32 +1,19 @@
|
||||
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
||||
project("whisper.cpp" C CXX)
|
||||
project("whisper.cpp" VERSION 1.7.4)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(SOVERSION 1)
|
||||
|
||||
#set(CMAKE_WARN_DEPRECATED YES)
|
||||
set(CMAKE_WARN_UNUSED_CLI YES)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
# Add path to modules
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
cmake_minimum_required (VERSION 3.0)
|
||||
project(whisper.cpp VERSION 1.0.0)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
||||
|
||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
set(WHISPER_STANDALONE ON)
|
||||
|
||||
include(git-vars)
|
||||
include(cmake/GitVars.cmake)
|
||||
include(cmake/BuildTypes.cmake)
|
||||
|
||||
# configure project version
|
||||
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
|
||||
if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
|
||||
configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
|
||||
endif()
|
||||
else()
|
||||
set(WHISPER_STANDALONE OFF)
|
||||
endif()
|
||||
@ -35,11 +22,6 @@ if (EMSCRIPTEN)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
||||
option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
|
||||
|
||||
# TODO: without these, we get the following error:
|
||||
# wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
|
||||
else()
|
||||
if (MINGW)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
@ -48,138 +30,205 @@ else()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
# options
|
||||
|
||||
#
|
||||
# option list
|
||||
#
|
||||
option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
|
||||
# general
|
||||
option(WHISPER_CCACHE "whisper: use ccache if available" ON)
|
||||
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
|
||||
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
|
||||
|
||||
# debug
|
||||
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
|
||||
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
|
||||
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
|
||||
option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
|
||||
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
|
||||
|
||||
# build
|
||||
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
|
||||
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
|
||||
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
|
||||
|
||||
option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
|
||||
|
||||
if (APPLE)
|
||||
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
|
||||
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
|
||||
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
|
||||
else()
|
||||
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
|
||||
endif()
|
||||
|
||||
option(WHISPER_PERF "whisper: enable perf timings" OFF)
|
||||
|
||||
# sanitizers
|
||||
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
|
||||
option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
|
||||
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
|
||||
|
||||
# extra artifacts
|
||||
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
|
||||
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
|
||||
option(WHISPER_BUILD_SERVER "whisper: build server example" ${WHISPER_STANDALONE})
|
||||
|
||||
# 3rd party libs
|
||||
option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
|
||||
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
|
||||
endif()
|
||||
|
||||
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
|
||||
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
|
||||
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_CCACHE ${WHISPER_CCACHE})
|
||||
set(GGML_SANITIZE_THREAD ${WHISPER_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${WHISPER_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
|
||||
set(GGML_ALL_WARNINGS ${WHISPER_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${WHISPER_FATAL_WARNINGS})
|
||||
|
||||
# transition helpers
|
||||
function (whisper_option_depr TYPE OLD NEW)
|
||||
if (${OLD})
|
||||
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
||||
set(${NEW} ON)
|
||||
if (NOT MSVC)
|
||||
if (WHISPER_SANITIZE_THREAD)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
whisper_option_depr(FATAL_ERROR WHISPER_CUBLAS GGML_CUDA)
|
||||
whisper_option_depr(WARNING WHISPER_CUDA GGML_CUDA)
|
||||
whisper_option_depr(WARNING WHISPER_KOMPUTE GGML_KOMPUTE)
|
||||
whisper_option_depr(WARNING WHISPER_METAL GGML_METAL)
|
||||
whisper_option_depr(WARNING WHISPER_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
whisper_option_depr(WARNING WHISPER_NATIVE GGML_NATIVE)
|
||||
whisper_option_depr(WARNING WHISPER_OPENMP GGML_OPENMP)
|
||||
whisper_option_depr(WARNING WHISPER_RPC GGML_RPC)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
|
||||
if (WHISPER_SANITIZE_ADDRESS)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
|
||||
endif()
|
||||
|
||||
#
|
||||
# build the library
|
||||
#
|
||||
|
||||
if (NOT TARGET ggml)
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
if (WHISPER_SANITIZE_UNDEFINED)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
|
||||
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
|
||||
|
||||
# dependencies
|
||||
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
# on APPLE - include Accelerate framework
|
||||
if (APPLE AND NOT WHISPER_NO_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if (ACCELERATE_FRAMEWORK)
|
||||
message(STATUS "Accelerate framework found")
|
||||
|
||||
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
|
||||
else()
|
||||
message(WARNING "Accelerate framework not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (WHISPER_SUPPORT_OPENBLAS)
|
||||
find_library(OPENBLAS_LIB
|
||||
NAMES openblas libopenblas
|
||||
)
|
||||
if (OPENBLAS_LIB)
|
||||
message(STATUS "OpenBLAS found")
|
||||
|
||||
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${OPENBLAS_LIB})
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
|
||||
else()
|
||||
message(WARNING "OpenBLAS not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# compiler flags
|
||||
|
||||
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
|
||||
endif ()
|
||||
|
||||
if (WHISPER_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
|
||||
-Wall \
|
||||
-Wextra \
|
||||
-Wpedantic \
|
||||
-Wshadow \
|
||||
-Wcast-qual \
|
||||
-Wstrict-prototypes \
|
||||
-Wpointer-arith \
|
||||
")
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
|
||||
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
|
||||
endif()
|
||||
|
||||
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
|
||||
message(STATUS "ARM detected")
|
||||
else()
|
||||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
|
||||
else()
|
||||
if (EMSCRIPTEN)
|
||||
# we require support for WASM SIMD 128-bit
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
||||
else()
|
||||
if(NOT WHISPER_NO_AVX)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
|
||||
endif()
|
||||
if(NOT WHISPER_NO_AVX2)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
|
||||
endif()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (WHISPER_PERF)
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
|
||||
endif()
|
||||
add_subdirectory(src)
|
||||
|
||||
#
|
||||
# install
|
||||
# whisper - this is the main library of the project
|
||||
#
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
set(TARGET whisper)
|
||||
|
||||
set(WHISPER_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(WHISPER_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(WHISPER_INSTALL_VERSION ${CMAKE_PROJECT_VERSION})
|
||||
add_library(${TARGET}
|
||||
ggml.c
|
||||
whisper.cpp
|
||||
)
|
||||
|
||||
set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(WHISPER_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(WHISPER_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
target_include_directories(${TARGET} PUBLIC
|
||||
.
|
||||
)
|
||||
|
||||
get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
|
||||
if (MSVC)
|
||||
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
|
||||
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
|
||||
else()
|
||||
target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper
|
||||
PATH_VARS
|
||||
WHISPER_INCLUDE_INSTALL_DIR
|
||||
WHISPER_LIB_INSTALL_DIR
|
||||
WHISPER_BIN_INSTALL_DIR )
|
||||
if (BUILD_SHARED_LIBS)
|
||||
target_link_libraries(${TARGET} PUBLIC
|
||||
${CMAKE_DL_LIBS}
|
||||
)
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
|
||||
VERSION ${WHISPER_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
target_compile_definitions(${TARGET} PUBLIC
|
||||
WHISPER_SHARED
|
||||
)
|
||||
endif()
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
|
||||
target_compile_definitions(${TARGET} PUBLIC
|
||||
${WHISPER_EXTRA_FLAGS}
|
||||
)
|
||||
|
||||
configure_file(cmake/whisper.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
|
||||
@ONLY)
|
||||
install(TARGETS ${TARGET}
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib/static
|
||||
)
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
|
||||
DESTINATION lib/pkgconfig)
|
||||
#
|
||||
# bindings
|
||||
#
|
||||
|
||||
add_subdirectory(bindings)
|
||||
|
||||
#
|
||||
# programs, examples and tests
|
||||
#
|
||||
|
||||
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
#include(CTest)
|
||||
#add_subdirectory(tests)
|
||||
endif ()
|
||||
if (WHISPER_STANDALONE)
|
||||
if (WHISPER_BUILD_TESTS)
|
||||
enable_testing()
|
||||
add_subdirectory(tests)
|
||||
endif ()
|
||||
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
endif()
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
endif()
|
||||
endif ()
|
||||
|
2
LICENSE
2
LICENSE
@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023-2024 The ggml authors
|
||||
Copyright (c) 2022 Georgi Gerganov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
186
Makefile
186
Makefile
@ -1,12 +1,163 @@
|
||||
ifndef UNAME_S
|
||||
UNAME_S := $(shell uname -s)
|
||||
endif
|
||||
|
||||
ifndef UNAME_P
|
||||
UNAME_P := $(shell uname -p)
|
||||
endif
|
||||
|
||||
ifndef UNAME_M
|
||||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
# Mac OS + Arm can report x86_64
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
ifneq ($(UNAME_P),arm)
|
||||
SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
|
||||
ifeq ($(SYSCTL_M),1)
|
||||
# UNAME_P := arm
|
||||
# UNAME_M := arm64
|
||||
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
|
||||
CFLAGS = -I. -O3 -std=c11
|
||||
CXXFLAGS = -I. -I./examples -O3 -std=c++11
|
||||
LDFLAGS =
|
||||
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
|
||||
# Architecture specific
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
ifeq ($(UNAME_M),x86_64)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -mfma -mf16c
|
||||
AVX1_M := $(shell sysctl machdep.cpu.features)
|
||||
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Linux)
|
||||
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell grep "fma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
else
|
||||
CFLAGS += -mfma -mf16c -mavx -mavx2
|
||||
endif
|
||||
endif
|
||||
ifeq ($(UNAME_M),amd64)
|
||||
CFLAGS += -mavx -mavx2 -mfma -mf16c
|
||||
endif
|
||||
ifndef WHISPER_NO_ACCELERATE
|
||||
# Mac M1 - include Accelerate framework
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -DGGML_USE_ACCELERATE
|
||||
LDFLAGS += -framework Accelerate
|
||||
endif
|
||||
endif
|
||||
ifdef WHISPER_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
ifdef WHISPER_GPROF
|
||||
CFLAGS += -pg
|
||||
CXXFLAGS += -pg
|
||||
endif
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
endif
|
||||
ifneq ($(filter armv6%,$(UNAME_M)),)
|
||||
# Raspberry Pi 1, 2, 3
|
||||
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
ifneq ($(filter armv7%,$(UNAME_M)),)
|
||||
# Raspberry Pi 4
|
||||
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
endif
|
||||
ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
# Raspberry Pi 4
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
default: main
|
||||
|
||||
#
|
||||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
|
||||
|
||||
whisper.o: whisper.cpp whisper.h
|
||||
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
|
||||
|
||||
libwhisper.a: ggml.o whisper.o
|
||||
$(AR) rcs libwhisper.a ggml.o whisper.o
|
||||
|
||||
libwhisper.so: ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -f *.o main stream command bench libwhisper.a libwhisper.so
|
||||
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
|
||||
CC_SDL=`sdl2-config --cflags --libs`
|
||||
|
||||
main: examples/main/main.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
|
||||
./main -h
|
||||
|
||||
stream: examples/stream/stream.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
command: examples/command/command.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
bench: examples/bench/bench.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Audio samples
|
||||
#
|
||||
|
||||
.PHONY: build
|
||||
build:
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
|
||||
# download a few audio samples into folder "./samples":
|
||||
.PHONY: samples
|
||||
samples:
|
||||
@ -16,19 +167,12 @@ samples:
|
||||
@wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
|
||||
@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
|
||||
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
|
||||
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
|
||||
@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
|
||||
@echo "Converting to 16-bit WAV ..."
|
||||
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
|
||||
@rm samples/*.ogg
|
||||
@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
|
||||
@rm samples/mm1.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
|
||||
@rm samples/a13.mp3
|
||||
@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
|
||||
@rm samples/diffusion2023-07-03.flac
|
||||
|
||||
#
|
||||
# Models
|
||||
@ -46,14 +190,10 @@ samples:
|
||||
.PHONY: medium.en
|
||||
.PHONY: medium
|
||||
.PHONY: large-v1
|
||||
.PHONY: large-v2
|
||||
.PHONY: large-v3
|
||||
.PHONY: large-v3-turbo
|
||||
.PHONY: large
|
||||
|
||||
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo:
|
||||
tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
|
||||
bash ./models/download-ggml-model.sh $@
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
@echo ""
|
||||
@echo "==============================================="
|
||||
@echo "Running $@ on all samples in ./samples ..."
|
||||
@ -64,6 +204,14 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 larg
|
||||
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "" ; \
|
||||
./build/bin/whisper-cli -m models/ggml-$@.bin -f $$f ; \
|
||||
./main -m models/ggml-$@.bin -f $$f ; \
|
||||
echo "" ; \
|
||||
done
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
.PHONY: tests
|
||||
tests:
|
||||
bash ./tests/run-tests.sh
|
||||
|
@ -1,19 +0,0 @@
|
||||
// swift-tools-version:5.5
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "whisper",
|
||||
platforms: [
|
||||
.macOS(.v12),
|
||||
.iOS(.v14),
|
||||
.watchOS(.v4),
|
||||
.tvOS(.v14)
|
||||
],
|
||||
products: [
|
||||
.library(name: "whisper", targets: ["whisper"]),
|
||||
],
|
||||
targets: [
|
||||
.systemLibrary(name: "whisper", pkgConfig: "whisper"),
|
||||
]
|
||||
)
|
677
README.md
677
README.md
@ -1,44 +1,33 @@
|
||||
# whisper.cpp
|
||||
|
||||

|
||||
|
||||
[](https://github.com/ggerganov/whisper.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://conan.io/center/whisper-cpp)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
- Plain C/C++ implementation without dependencies
|
||||
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support)
|
||||
- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
|
||||
- AVX intrinsics support for x86 architectures
|
||||
- VSX intrinsics support for POWER architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- [Integer quantization support](#quantization)
|
||||
- Low memory usage (Flash Attention + Flash Forward)
|
||||
- Zero memory allocations at runtime
|
||||
- [Vulkan support](#vulkan-gpu-support)
|
||||
- Support for CPU-only inference
|
||||
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
|
||||
- [OpenVINO Support](#openvino-support)
|
||||
- [Ascend NPU Support](#ascend-npu-support)
|
||||
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
|
||||
- Runs on the CPU
|
||||
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
|
||||
|
||||
Supported platforms:
|
||||
|
||||
- [x] Mac OS (Intel and Arm)
|
||||
- [x] [iOS](examples/whisper.objc)
|
||||
- [x] [Android](examples/whisper.android)
|
||||
- [x] [Java](bindings/java/README.md)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] Linux
|
||||
- [x] [WebAssembly](examples/whisper.wasm)
|
||||
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
|
||||
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
- [x] [Android](https://github.com/ggerganov/whisper.cpp/issues/30)
|
||||
|
||||
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
|
||||
The entire implementation of the model is contained in 2 source files:
|
||||
|
||||
- Tensor operations: [ggml.h](ggml.h) / [ggml.c](ggml.c)
|
||||
- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)
|
||||
|
||||
Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
|
||||
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
|
||||
@ -49,53 +38,150 @@ You can also easily make your own offline voice assistant application: [command]
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
|
||||
|
||||
On Apple Silicon, the inference runs fully on the GPU via Metal:
|
||||
Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
|
||||
|
||||
https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
## Implementation details
|
||||
|
||||
- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
|
||||
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
|
||||
- Sample usage is demonstrated in [main.cpp](examples/main)
|
||||
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
|
||||
- Various other examples are available in the [examples](examples) folder
|
||||
|
||||
The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
|
||||
instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
|
||||
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
|
||||
|
||||
## Limitations
|
||||
|
||||
- Inference only
|
||||
- No GPU support
|
||||
- Very basic greedy sampling scheme - always pick up the token with highest probability.
|
||||
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
|
||||
from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
|
||||
to run the python code with the following parameters:
|
||||
|
||||
```
|
||||
whisper --best_of None --beam_size None ...
|
||||
```
|
||||
|
||||
In the future, `whisper.cpp` will support more sampling strategies.
|
||||
|
||||
## Quick start
|
||||
|
||||
First clone the repository:
|
||||
First, download one of the Whisper models converted in [ggml format](models). For example:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
bash ./models/download-ggml-model.sh base.en
|
||||
```
|
||||
|
||||
Navigate into the directory:
|
||||
|
||||
```
|
||||
cd whisper.cpp
|
||||
```
|
||||
|
||||
Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
|
||||
Now build the [main](examples/main) example and transcribe an audio file like this:
|
||||
|
||||
```bash
|
||||
sh ./models/download-ggml-model.sh base.en
|
||||
```
|
||||
|
||||
Now build the [whisper-cli](examples/cli) example and transcribe an audio file like this:
|
||||
|
||||
```bash
|
||||
# build the project
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
# build the main example
|
||||
make
|
||||
|
||||
# transcribe an audio file
|
||||
./build/bin/whisper-cli -f samples/jfk.wav
|
||||
./main -f input.wav
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
For a quick demo, simply run `make base.en`.
|
||||
For a quick demo, simply run `make base.en`:
|
||||
|
||||
```java
|
||||
$ make base.en
|
||||
|
||||
cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
|
||||
c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
|
||||
c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o ggml.o -o main -framework Accelerate
|
||||
./main -h
|
||||
|
||||
usage: ./main [options] file0.wav file1.wav ...
|
||||
|
||||
options:
|
||||
-h, --help [default] show this help message and exit
|
||||
-t N, --threads N [4 ] number of threads to use during computation
|
||||
-p N, --processors N [1 ] number of processors to use during computation
|
||||
-ot N, --offset-t N [0 ] time offset in milliseconds
|
||||
-on N, --offset-n N [0 ] segment index offset
|
||||
-d N, --duration N [0 ] duration of audio to process in milliseconds
|
||||
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
|
||||
-ml N, --max-len N [0 ] maximum segment length in characters
|
||||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
||||
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
|
||||
-tr, --translate [false ] translate from source language to english
|
||||
-otxt, --output-txt [false ] output result in a text file
|
||||
-ovtt, --output-vtt [false ] output result in a vtt file
|
||||
-osrt, --output-srt [false ] output result in a srt file
|
||||
-owts, --output-words [false ] output script for generating karaoke video
|
||||
-ps, --print-special [false ] print special tokens
|
||||
-pc, --print-colors [false ] print colors
|
||||
-nt, --no-timestamps [true ] do not print timestamps
|
||||
-l LANG, --language LANG [en ] spoken language
|
||||
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
|
||||
-f FNAME, --file FNAME [ ] input WAV file path
|
||||
|
||||
bash ./models/download-ggml-model.sh base.en
|
||||
Downloading ggml model base.en ...
|
||||
ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
|
||||
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
|
||||
You can now use it like this:
|
||||
|
||||
$ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
|
||||
|
||||
===============================================
|
||||
Running base.en on all samples in ./samples ...
|
||||
===============================================
|
||||
|
||||
----------------------------------------------
|
||||
[+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
|
||||
----------------------------------------------
|
||||
|
||||
whisper_model_load: loading model from 'models/ggml-base.en.bin'
|
||||
whisper_model_load: n_vocab = 51864
|
||||
whisper_model_load: n_audio_ctx = 1500
|
||||
whisper_model_load: n_audio_state = 512
|
||||
whisper_model_load: n_audio_head = 8
|
||||
whisper_model_load: n_audio_layer = 6
|
||||
whisper_model_load: n_text_ctx = 448
|
||||
whisper_model_load: n_text_state = 512
|
||||
whisper_model_load: n_text_head = 8
|
||||
whisper_model_load: n_text_layer = 6
|
||||
whisper_model_load: n_mels = 80
|
||||
whisper_model_load: f16 = 1
|
||||
whisper_model_load: type = 2
|
||||
whisper_model_load: adding 1607 extra tokens
|
||||
whisper_model_load: mem_required = 506.00 MB
|
||||
whisper_model_load: ggml ctx size = 140.60 MB
|
||||
whisper_model_load: memory size = 22.83 MB
|
||||
whisper_model_load: model size = 140.54 MB
|
||||
|
||||
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
|
||||
|
||||
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
|
||||
|
||||
|
||||
[00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
|
||||
|
||||
|
||||
whisper_print_timings: load time = 105.91 ms
|
||||
whisper_print_timings: mel time = 24.62 ms
|
||||
whisper_print_timings: sample time = 3.63 ms
|
||||
whisper_print_timings: encode time = 324.71 ms / 54.12 ms per layer
|
||||
whisper_print_timings: decode time = 83.58 ms / 13.93 ms per layer
|
||||
whisper_print_timings: total time = 542.81 ms
|
||||
```
|
||||
|
||||
The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
|
||||
|
||||
For detailed usage instructions, run: `./build/bin/whisper-cli -h`
|
||||
For detailed usage instructions, run: `./main -h`
|
||||
|
||||
Note that the [whisper-cli](examples/cli) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
|
||||
Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
|
||||
For example, you can use `ffmpeg` like this:
|
||||
|
||||
```bash
|
||||
```java
|
||||
ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
|
||||
```
|
||||
|
||||
@ -104,7 +190,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
|
||||
If you want some extra audio samples to play with, simply run:
|
||||
|
||||
```
|
||||
make -j samples
|
||||
make samples
|
||||
```
|
||||
|
||||
This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.
|
||||
@ -112,278 +198,112 @@ This will download a few more audio files from Wikipedia and convert them to 16-
|
||||
You can download and run the other models as follows:
|
||||
|
||||
```
|
||||
make -j tiny.en
|
||||
make -j tiny
|
||||
make -j base.en
|
||||
make -j base
|
||||
make -j small.en
|
||||
make -j small
|
||||
make -j medium.en
|
||||
make -j medium
|
||||
make -j large-v1
|
||||
make -j large-v2
|
||||
make -j large-v3
|
||||
make -j large-v3-turbo
|
||||
make tiny.en
|
||||
make tiny
|
||||
make base.en
|
||||
make base
|
||||
make small.en
|
||||
make small
|
||||
make medium.en
|
||||
make medium
|
||||
make large-v1
|
||||
make large
|
||||
```
|
||||
|
||||
## Memory usage
|
||||
|
||||
| Model | Disk | Mem |
|
||||
| ------ | ------- | ------- |
|
||||
| tiny | 75 MiB | ~273 MB |
|
||||
| base | 142 MiB | ~388 MB |
|
||||
| small | 466 MiB | ~852 MB |
|
||||
| medium | 1.5 GiB | ~2.1 GB |
|
||||
| large | 2.9 GiB | ~3.9 GB |
|
||||
| Model | Disk | Mem | SHA |
|
||||
| --- | --- | --- | --- |
|
||||
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
|
||||
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
||||
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
||||
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
||||
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
||||
|
||||
## Quantization
|
||||
## Another example
|
||||
|
||||
`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
|
||||
Quantized models require less memory and disk space and depending on the hardware can be processed more efficiently.
|
||||
Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
|
||||
in about half a minute on a MacBook M1 Pro, using `medium.en` model:
|
||||
|
||||
Here are the steps for creating and using a quantized model:
|
||||
<details>
|
||||
<summary>Expand to see the result</summary>
|
||||
|
||||
```bash
|
||||
# quantize a model with Q5_0 method
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
|
||||
```java
|
||||
$ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
|
||||
|
||||
# run the examples as usual, specifying the quantized model file
|
||||
./build/bin/whisper-cli -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
|
||||
whisper_model_load: loading model from 'models/ggml-medium.en.bin'
|
||||
whisper_model_load: n_vocab = 51864
|
||||
whisper_model_load: n_audio_ctx = 1500
|
||||
whisper_model_load: n_audio_state = 1024
|
||||
whisper_model_load: n_audio_head = 16
|
||||
whisper_model_load: n_audio_layer = 24
|
||||
whisper_model_load: n_text_ctx = 448
|
||||
whisper_model_load: n_text_state = 1024
|
||||
whisper_model_load: n_text_head = 16
|
||||
whisper_model_load: n_text_layer = 24
|
||||
whisper_model_load: n_mels = 80
|
||||
whisper_model_load: f16 = 1
|
||||
whisper_model_load: type = 4
|
||||
whisper_model_load: mem_required = 2610.00 MB
|
||||
whisper_model_load: adding 1607 extra tokens
|
||||
whisper_model_load: ggml ctx size = 1644.97 MB
|
||||
whisper_model_load: memory size = 182.62 MB
|
||||
whisper_model_load: model size = 1462.12 MB
|
||||
|
||||
main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, lang = en, task = transcribe, timestamps = 1 ...
|
||||
|
||||
[00:00.000 --> 00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
|
||||
[00:08.000 --> 00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
|
||||
[00:17.000 --> 00:23.000] A short time later, debris was seen falling from the skies above Texas.
|
||||
[00:23.000 --> 00:29.000] The Columbia's lost. There are no survivors.
|
||||
[00:29.000 --> 00:32.000] On board was a crew of seven.
|
||||
[00:32.000 --> 00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
|
||||
[00:39.000 --> 00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
|
||||
[00:48.000 --> 00:52.000] a colonel in the Israeli Air Force.
|
||||
[00:52.000 --> 00:58.000] These men and women assumed great risk in the service to all humanity.
|
||||
[00:58.000 --> 01:03.000] In an age when space flight has come to seem almost routine,
|
||||
[01:03.000 --> 01:07.000] it is easy to overlook the dangers of travel by rocket
|
||||
[01:07.000 --> 01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
|
||||
[01:12.000 --> 01:18.000] These astronauts knew the dangers, and they faced them willingly,
|
||||
[01:18.000 --> 01:23.000] knowing they had a high and noble purpose in life.
|
||||
[01:23.000 --> 01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
|
||||
[01:31.000 --> 01:36.000] All Americans today are thinking as well of the families of these men and women
|
||||
[01:36.000 --> 01:40.000] who have been given this sudden shock and grief.
|
||||
[01:40.000 --> 01:45.000] You're not alone. Our entire nation grieves with you,
|
||||
[01:45.000 --> 01:52.000] and those you love will always have the respect and gratitude of this country.
|
||||
[01:52.000 --> 01:56.000] The cause in which they died will continue.
|
||||
[01:56.000 --> 02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
|
||||
[02:04.000 --> 02:11.000] and the longing to understand. Our journey into space will go on.
|
||||
[02:11.000 --> 02:16.000] In the skies today, we saw destruction and tragedy.
|
||||
[02:16.000 --> 02:22.000] Yet farther than we can see, there is comfort and hope.
|
||||
[02:22.000 --> 02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
|
||||
[02:29.000 --> 02:35.000] who created all these. He who brings out the starry hosts one by one
|
||||
[02:35.000 --> 02:39.000] and calls them each by name."
|
||||
[02:39.000 --> 02:46.000] Because of His great power and mighty strength, not one of them is missing.
|
||||
[02:46.000 --> 02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
|
||||
[02:55.000 --> 03:01.000] The crew of the shuttle Columbia did not return safely to earth,
|
||||
[03:01.000 --> 03:05.000] yet we can pray that all are safely home.
|
||||
[03:05.000 --> 03:13.000] May God bless the grieving families, and may God continue to bless America.
|
||||
[03:13.000 --> 03:41.000] Audio
|
||||
|
||||
|
||||
whisper_print_timings: load time = 575.92 ms
|
||||
whisper_print_timings: mel time = 230.60 ms
|
||||
whisper_print_timings: sample time = 73.19 ms
|
||||
whisper_print_timings: encode time = 19552.61 ms / 814.69 ms per layer
|
||||
whisper_print_timings: decode time = 13249.96 ms / 552.08 ms per layer
|
||||
whisper_print_timings: total time = 33686.27 ms
|
||||
```
|
||||
|
||||
## Core ML support
|
||||
|
||||
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
|
||||
speed-up - more than x3 faster compared with CPU-only execution. Here are the instructions for generating a Core ML model and using it with `whisper.cpp`:
|
||||
|
||||
- Install Python dependencies needed for the creation of the Core ML model:
|
||||
|
||||
```bash
|
||||
pip install ane_transformers
|
||||
pip install openai-whisper
|
||||
pip install coremltools
|
||||
```
|
||||
|
||||
- To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
|
||||
- Python 3.10 is recommended.
|
||||
- MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
|
||||
- [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
|
||||
- To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
|
||||
- To activate the environment, use: `conda activate py310-whisper`
|
||||
|
||||
- Generate a Core ML model. For example, to generate a `base.en` model, use:
|
||||
|
||||
```bash
|
||||
./models/generate-coreml-model.sh base.en
|
||||
```
|
||||
|
||||
This will generate the folder `models/ggml-base.en-encoder.mlmodelc`
|
||||
|
||||
- Build `whisper.cpp` with Core ML support:
|
||||
|
||||
```bash
|
||||
# using CMake
|
||||
cmake -B build -DWHISPER_COREML=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
- Run the examples as usual. For example:
|
||||
|
||||
```text
|
||||
$ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
|
||||
...
|
||||
|
||||
whisper_init_state: loading Core ML model from 'models/ggml-base.en-encoder.mlmodelc'
|
||||
whisper_init_state: first run on a device may take a while ...
|
||||
whisper_init_state: Core ML model loaded
|
||||
|
||||
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
|
||||
Next runs are faster.
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
|
||||
|
||||
## OpenVINO support
|
||||
|
||||
On platforms that support [OpenVINO](https://github.com/openvinotoolkit/openvino), the Encoder inference can be executed
|
||||
on OpenVINO-supported devices including x86 CPUs and Intel GPUs (integrated & discrete).
|
||||
|
||||
This can result in significant speedup in encoder performance. Here are the instructions for generating the OpenVINO model and using it with `whisper.cpp`:
|
||||
|
||||
- First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.
|
||||
|
||||
Windows:
|
||||
|
||||
```powershell
|
||||
cd models
|
||||
python -m venv openvino_conv_env
|
||||
openvino_conv_env\Scripts\activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r requirements-openvino.txt
|
||||
```
|
||||
|
||||
Linux and macOS:
|
||||
|
||||
```bash
|
||||
cd models
|
||||
python3 -m venv openvino_conv_env
|
||||
source openvino_conv_env/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r requirements-openvino.txt
|
||||
```
|
||||
|
||||
- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
|
||||
|
||||
```
|
||||
python convert-whisper-to-openvino.py --model base.en
|
||||
```
|
||||
|
||||
This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as `ggml` models, as that
|
||||
is the default location that the OpenVINO extension will search at runtime.
|
||||
|
||||
- Build `whisper.cpp` with OpenVINO support:
|
||||
|
||||
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
|
||||
|
||||
After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
|
||||
|
||||
Linux:
|
||||
|
||||
```bash
|
||||
source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
|
||||
```
|
||||
|
||||
Windows (cmd):
|
||||
|
||||
```powershell
|
||||
C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
|
||||
```
|
||||
|
||||
And then build the project using cmake:
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_OPENVINO=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
- Run the examples as usual. For example:
|
||||
|
||||
```text
|
||||
$ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
|
||||
...
|
||||
|
||||
whisper_ctx_init_openvino_encoder: loading OpenVINO model from 'models/ggml-base.en-encoder-openvino.xml'
|
||||
whisper_ctx_init_openvino_encoder: first run on a device may take a while ...
|
||||
whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = GPU, cache_dir = models/ggml-base.en-encoder-openvino-cache
|
||||
whisper_ctx_init_openvino_encoder: OpenVINO model loaded
|
||||
|
||||
system_info: n_threads = 4 / 8 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 1 |
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
|
||||
cached for the next run.
|
||||
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
|
||||
First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
|
||||
|
||||
Now build `whisper.cpp` with CUDA support:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_CUDA=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## Vulkan GPU support
|
||||
Cross-vendor solution which allows you to accelerate workload on your GPU.
|
||||
First, make sure your graphics card driver provides support for Vulkan API.
|
||||
|
||||
Now build `whisper.cpp` with Vulkan support:
|
||||
```
|
||||
cmake -B build -DGGML_VULKAN=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## BLAS CPU support via OpenBLAS
|
||||
|
||||
Encoder processing can be accelerated on the CPU via OpenBLAS.
|
||||
First, make sure you have installed `openblas`: https://www.openblas.net/
|
||||
|
||||
Now build `whisper.cpp` with OpenBLAS support:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_BLAS=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## Ascend NPU support
|
||||
|
||||
Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
|
||||
|
||||
First, check if your Ascend NPU device is supported:
|
||||
|
||||
**Verified devices**
|
||||
| Ascend NPU | Status |
|
||||
|:-----------------------------:|:-------:|
|
||||
| Atlas 300T A2 | Support |
|
||||
|
||||
Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community) . The lasted version of CANN is recommanded.
|
||||
|
||||
Now build `whisper.cpp` with CANN support:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_CANN=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
Run the inference examples as usual, for example:
|
||||
|
||||
```
|
||||
./build/bin/whisper-cli -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
||||
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
|
||||
- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
|
||||
|
||||
## Installing with Conan
|
||||
|
||||
You can install pre-built binaries for whisper.cpp or build it from source using [Conan](https://conan.io/). Use the following command:
|
||||
|
||||
```
|
||||
conan install --requires="whisper-cpp/[*]" --build=missing
|
||||
```
|
||||
|
||||
For detailed instructions on how to use Conan, please refer to the [Conan documentation](https://docs.conan.io/2/).
|
||||
|
||||
## Limitations
|
||||
|
||||
- Inference only
|
||||
</details>
|
||||
|
||||
## Real-time audio input example
|
||||
|
||||
This is a naive example of performing real-time inference on audio from your microphone.
|
||||
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
|
||||
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
|
||||
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_SDL2=ON
|
||||
cmake --build build --config Release
|
||||
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
|
||||
```java
|
||||
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
|
||||
```
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
|
||||
@ -393,22 +313,18 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
|
||||
Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
|
||||
to highlight words with high or low confidence:
|
||||
|
||||
```bash
|
||||
./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
|
||||
```
|
||||
|
||||
<img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
|
||||
|
||||
## Controlling the length of the generated text segments (experimental)
|
||||
|
||||
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
|
||||
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
|
||||
|
||||
```text
|
||||
$ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
|
||||
```java
|
||||
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
|
||||
|
||||
whisper_model_load: loading model from './models/ggml-base.en.bin'
|
||||
...
|
||||
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
|
||||
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
|
||||
|
||||
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
|
||||
|
||||
@ -423,20 +339,20 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
|
||||
[00:00:10.020 --> 00:00:11.000] country.
|
||||
```
|
||||
|
||||
## Word-level timestamp (experimental)
|
||||
## Word-level timestamp
|
||||
|
||||
The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:
|
||||
|
||||
```text
|
||||
$ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
|
||||
```java
|
||||
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
|
||||
|
||||
whisper_model_load: loading model from './models/ggml-base.en.bin'
|
||||
...
|
||||
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
|
||||
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
|
||||
|
||||
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
|
||||
|
||||
[00:00:00.000 --> 00:00:00.320]
|
||||
[00:00:00.000 --> 00:00:00.320]
|
||||
[00:00:00.320 --> 00:00:00.370] And
|
||||
[00:00:00.370 --> 00:00:00.690] so
|
||||
[00:00:00.690 --> 00:00:00.850] my
|
||||
@ -464,42 +380,16 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
|
||||
[00:00:10.510 --> 00:00:11.000] .
|
||||
```
|
||||
|
||||
## Speaker segmentation via tinydiarize (experimental)
|
||||
|
||||
More information about this approach is available here: https://github.com/ggerganov/whisper.cpp/pull/1058
|
||||
|
||||
Sample usage:
|
||||
|
||||
```py
|
||||
# download a tinydiarize compatible model
|
||||
./models/download-ggml-model.sh small.en-tdrz
|
||||
|
||||
# run as usual, adding the "-tdrz" command-line argument
|
||||
./build/bin/whisper-cli -f ./samples/a13.wav -m ./models/ggml-small.en-tdrz.bin -tdrz
|
||||
...
|
||||
main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, tdrz = 1, timestamps = 1 ...
|
||||
...
|
||||
[00:00:00.000 --> 00:00:03.800] Okay Houston, we've had a problem here. [SPEAKER_TURN]
|
||||
[00:00:03.800 --> 00:00:06.200] This is Houston. Say again please. [SPEAKER_TURN]
|
||||
[00:00:06.200 --> 00:00:08.260] Uh Houston we've had a problem.
|
||||
[00:00:08.260 --> 00:00:11.320] We've had a main beam up on a volt. [SPEAKER_TURN]
|
||||
[00:00:11.320 --> 00:00:13.820] Roger main beam interval. [SPEAKER_TURN]
|
||||
[00:00:13.820 --> 00:00:15.100] Uh uh [SPEAKER_TURN]
|
||||
[00:00:15.100 --> 00:00:18.020] So okay stand, by thirteen we're looking at it. [SPEAKER_TURN]
|
||||
[00:00:18.020 --> 00:00:25.740] Okay uh right now uh Houston the uh voltage is uh is looking good um.
|
||||
[00:00:27.620 --> 00:00:29.940] And we had a a pretty large bank or so.
|
||||
```
|
||||
|
||||
## Karaoke-style movie generation (experimental)
|
||||
|
||||
The [whisper-cli](examples/cli) example provides support for output of karaoke-style movies, where the
|
||||
The [main](examples/main) example provides support for output of karaoke-style movies, where the
|
||||
currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
|
||||
This requires to have `ffmpeg` installed.
|
||||
|
||||
Here are a few _"typical"_ examples:
|
||||
Here are a few *"typical"* examples:
|
||||
|
||||
```bash
|
||||
./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
|
||||
```java
|
||||
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
|
||||
source ./samples/jfk.wav.wts
|
||||
ffplay ./samples/jfk.wav.mp4
|
||||
```
|
||||
@ -508,8 +398,8 @@ https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b
|
||||
|
||||
---
|
||||
|
||||
```bash
|
||||
./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
|
||||
```java
|
||||
./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
|
||||
source ./samples/mm0.wav.wts
|
||||
ffplay ./samples/mm0.wav.mp4
|
||||
```
|
||||
@ -518,8 +408,8 @@ https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-9
|
||||
|
||||
---
|
||||
|
||||
```bash
|
||||
./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
|
||||
```java
|
||||
./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
|
||||
source ./samples/gb0.wav.wts
|
||||
ffplay ./samples/gb0.wav.mp4
|
||||
```
|
||||
@ -528,40 +418,15 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
|
||||
|
||||
---
|
||||
|
||||
## Video comparison of different models
|
||||
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
|
||||
```bash
|
||||
./scripts/bench-wts.sh samples/jfk.wav
|
||||
ffplay ./samples/jfk.wav.all.mp4
|
||||
```
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
|
||||
|
||||
---
|
||||
|
||||
## Benchmarks
|
||||
|
||||
In order to have an objective comparison of the performance of the inference across different system configurations,
|
||||
use the [whisper-bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
|
||||
use the [bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
|
||||
took to execute it. The results are summarized in the following Github issue:
|
||||
|
||||
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
|
||||
|
||||
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
|
||||
|
||||
You can run it with the following command, by default it will run against any standard model in the models folder.
|
||||
|
||||
```bash
|
||||
python3 scripts/bench.py -f samples/jfk.wav -t 2,4,8 -p 1,2
|
||||
```
|
||||
|
||||
It is written in python with the intention of being easy to modify and extend for your benchmarking use case.
|
||||
|
||||
It outputs a csv file with the results of the benchmarking.
|
||||
|
||||
## `ggml` format
|
||||
## ggml format
|
||||
|
||||
The original models are converted to a custom binary format. This allows to pack everything needed into a single file:
|
||||
|
||||
@ -573,54 +438,36 @@ The original models are converted to a custom binary format. This allows to pack
|
||||
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
|
||||
or manually from here:
|
||||
|
||||
- https://huggingface.co/ggerganov/whisper.cpp
|
||||
- https://huggingface.co/datasets/ggerganov/whisper.cpp
|
||||
- https://ggml.ggerganov.com
|
||||
|
||||
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
|
||||
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
|
||||
in [models](models).
|
||||
|
||||
## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)
|
||||
## Bindings
|
||||
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
|
||||
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
- [x] Java:
|
||||
- [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
|
||||
- [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
|
||||
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
|
||||
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
|
||||
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
|
||||
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
|
||||
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
|
||||
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
|
||||
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
|
||||
- [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
|
||||
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
|
||||
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
|
||||
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
|
||||
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
|
||||
- [ ] Python:
|
||||
- [ ] Java:
|
||||
|
||||
## Examples
|
||||
|
||||
There are various examples of using the library for different projects in the [examples](examples) folder.
|
||||
Some of the examples are even ported to run in the browser using WebAssembly. Check them out!
|
||||
|
||||
| Example | Web | Description |
|
||||
| --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [whisper-cli](examples/cli) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
|
||||
| [whisper-bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
|
||||
| [whisper-stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
|
||||
| [whisper-command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
|
||||
| [whisper-server](examples/server) | | HTTP transcription server with OAI-like API |
|
||||
| [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
|
||||
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
|
||||
| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
|
||||
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
|
||||
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
|
||||
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
|
||||
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
|
||||
| [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
|
||||
| Example | Web | Description |
|
||||
| --- | --- | --- |
|
||||
| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
|
||||
| [bench](examples/bench) | | Benchmark the performance of Whisper on your machine |
|
||||
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
|
||||
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
|
||||
| | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot in your browser |
|
||||
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
|
||||
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
|
||||
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
|
||||
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
|
||||
|
||||
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
|
||||
|
||||
|
249
README_sycl.md
249
README_sycl.md
@ -1,249 +0,0 @@
|
||||
# whisper.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators<72>such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel<EFBFBD> DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The whisper.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel<65> oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel<65> oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
|
||||
|
||||
#for FP32
|
||||
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute whisper.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_whisper.sh
|
||||
```
|
||||
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|
||||
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
@ -1,5 +0,0 @@
|
||||
module whisper [system] {
|
||||
header "whisper.h"
|
||||
link "whisper"
|
||||
export *
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <whisper.h>
|
||||
|
@ -1,19 +1,3 @@
|
||||
if (EMSCRIPTEN)
|
||||
add_subdirectory(javascript)
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/javascript/publish.log
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/whisper.js
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/libwhisper.worker.js
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/package.json
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/javascript
|
||||
COMMAND npm publish
|
||||
COMMAND touch publish.log
|
||||
COMMENT "Publishing npm module v${PROJECT_VERSION}"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(publish-npm
|
||||
DEPENDS javascript/publish.log
|
||||
)
|
||||
endif()
|
||||
|
2
bindings/go/.gitignore
vendored
2
bindings/go/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
build
|
||||
models
|
@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2022 David Thorpe
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -1,69 +0,0 @@
|
||||
ifndef UNAME_S
|
||||
UNAME_S := $(shell uname -s)
|
||||
endif
|
||||
|
||||
ifndef UNAME_P
|
||||
UNAME_P := $(shell uname -p)
|
||||
endif
|
||||
|
||||
ifndef UNAME_M
|
||||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
GGML_METAL_PATH_RESOURCES := $(abspath ../..)
|
||||
BUILD_DIR := build
|
||||
MODELS_DIR := models
|
||||
EXAMPLES_DIR := $(wildcard examples/*)
|
||||
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
|
||||
LIBRARY_PATH := $(abspath ../..)
|
||||
|
||||
ifeq ($(GGML_CUDA),1)
|
||||
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
|
||||
BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
|
||||
endif
|
||||
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
|
||||
endif
|
||||
|
||||
all: clean whisper examples
|
||||
|
||||
whisper: mkdir
|
||||
@echo Build whisper
|
||||
@${MAKE} -C ../.. libwhisper.a
|
||||
|
||||
test: model-small whisper modtidy
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v .
|
||||
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v ./pkg/whisper/...
|
||||
else
|
||||
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
|
||||
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...
|
||||
endif
|
||||
|
||||
examples: $(EXAMPLES_DIR)
|
||||
|
||||
model-small: mkdir examples/go-model-download
|
||||
@${BUILD_DIR}/go-model-download -out models ggml-small.en.bin
|
||||
|
||||
$(EXAMPLES_DIR): mkdir whisper modtidy
|
||||
@echo Build example $(notdir $@)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go build ${BUILD_FLAGS} -ldflags "-extldflags '$(EXT_LDFLAGS)'" -o ${BUILD_DIR}/$(notdir $@) ./$@
|
||||
else
|
||||
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@
|
||||
endif
|
||||
|
||||
mkdir:
|
||||
@echo Mkdir ${BUILD_DIR}
|
||||
@install -d ${BUILD_DIR}
|
||||
@echo Mkdir ${MODELS_DIR}
|
||||
@install -d ${MODELS_DIR}
|
||||
|
||||
modtidy:
|
||||
@go mod tidy
|
||||
|
||||
clean:
|
||||
@echo Clean
|
||||
@rm -fr $(BUILD_DIR)
|
||||
@go clean
|
@ -1,106 +0,0 @@
|
||||
# Go bindings for Whisper
|
||||
|
||||
This package provides Go bindings for whisper.cpp. They have been tested on:
|
||||
|
||||
* Darwin (OS X) 12.6 on x64_64
|
||||
* Debian Linux on arm64
|
||||
* Fedora Linux on x86_64
|
||||
|
||||
The "low level" bindings are in the `bindings/go` directory and there is a more
|
||||
Go-style package in the `bindings/go/pkg/whisper` directory. The most simple usage
|
||||
is as follows:
|
||||
|
||||
```go
|
||||
import (
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var modelpath string // Path to the model
|
||||
var samples []float32 // Samples to process
|
||||
|
||||
// Load the model
|
||||
model, err := whisper.New(modelpath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer model.Close()
|
||||
|
||||
// Process samples
|
||||
context, err := model.NewContext()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := context.Process(samples, nil, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Print out the results
|
||||
for {
|
||||
segment, err := context.NextSegment()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
fmt.Printf("[%6s->%6s] %s\n", segment.Start, segment.End, segment.Text)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Building & Testing
|
||||
|
||||
In order to build, you need to have the Go compiler installed. You can get it from [here](https://golang.org/dl/). Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/go
|
||||
make test
|
||||
```
|
||||
|
||||
This will compile a static `libwhisper.a` in a `build` folder, download a model file, then run the tests. To build the examples:
|
||||
|
||||
```bash
|
||||
make examples
|
||||
```
|
||||
|
||||
To build using cuda support add `GGML_CUDA=1`:
|
||||
|
||||
```bash
|
||||
GGML_CUDA=1 make examples
|
||||
```
|
||||
|
||||
The examples are placed in the `build` directory. Once built, you can download all the models with the following command:
|
||||
|
||||
```bash
|
||||
./build/go-model-download -out models
|
||||
```
|
||||
|
||||
And you can then test a model against samples with the following command:
|
||||
|
||||
```bash
|
||||
./build/go-whisper -model models/ggml-tiny.en.bin samples/jfk.wav
|
||||
```
|
||||
|
||||
## Using the bindings
|
||||
|
||||
To use the bindings in your own software,
|
||||
|
||||
1. Import `github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper` (or `github.com/ggerganov/whisper.cpp/bindings/go` into your package;
|
||||
2. Compile `libwhisper.a` (you can use `make whisper` in the `bindings/go` directory);
|
||||
3. Link your go binary against whisper by setting the environment variables `C_INCLUDE_PATH` and `LIBRARY_PATH`
|
||||
to point to the `whisper.h` file directory and `libwhisper.a` file directory respectively.
|
||||
|
||||
Look at the `Makefile` in the `bindings/go` directory for an example.
|
||||
|
||||
The API Documentation:
|
||||
|
||||
* https://pkg.go.dev/github.com/ggerganov/whisper.cpp/bindings/go
|
||||
* https://pkg.go.dev/github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper
|
||||
|
||||
Getting help:
|
||||
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
|
||||
## License
|
||||
|
||||
The license for the Go bindings is the same as the license for the rest of the whisper.cpp project, which is the MIT License. See the `LICENSE` file for more details.
|
||||
|
@ -1,5 +0,0 @@
|
||||
/*
|
||||
github.com/ggerganov/whisper.cpp/bindings/go
|
||||
provides a speech-to-text service bindings for the Go programming language.
|
||||
*/
|
||||
package whisper
|
@ -1,30 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"os/signal"
|
||||
)
|
||||
|
||||
// ContextForSignal returns a context object which is cancelled when a signal
|
||||
// is received. It returns nil if no signal parameter is provided
|
||||
func ContextForSignal(signals ...os.Signal) context.Context {
|
||||
if len(signals) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ch := make(chan os.Signal)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// Send message on channel when signal received
|
||||
signal.Notify(ch, signals...)
|
||||
|
||||
// When any signal received, call cancel
|
||||
go func() {
|
||||
<-ch
|
||||
cancel()
|
||||
}()
|
||||
|
||||
// Return success
|
||||
return ctx
|
||||
}
|
@ -1,208 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
const (
|
||||
srcUrl = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
|
||||
srcExt = ".bin" // Filename extension
|
||||
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
|
||||
)
|
||||
|
||||
var (
|
||||
// The models which will be downloaded, if no model is specified as an argument
|
||||
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
|
||||
)
|
||||
|
||||
var (
|
||||
// The output folder. When not set, use current working directory.
|
||||
flagOut = flag.String("out", "", "Output folder")
|
||||
|
||||
// HTTP timeout parameter - will timeout if takes longer than this to download a model
|
||||
flagTimeout = flag.Duration("timeout", 30*time.Minute, "HTTP timeout")
|
||||
|
||||
// Quiet parameter - will not print progress if set
|
||||
flagQuiet = flag.Bool("quiet", false, "Quiet mode")
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// MAIN
|
||||
|
||||
func main() {
|
||||
flag.Usage = func() {
|
||||
name := filepath.Base(flag.CommandLine.Name())
|
||||
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [options] <model>\n\n", name)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
flag.Parse()
|
||||
|
||||
// Get output path
|
||||
out, err := GetOut()
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
os.Exit(-1)
|
||||
}
|
||||
|
||||
// Create context which quits on SIGINT or SIGQUIT
|
||||
ctx := ContextForSignal(os.Interrupt, syscall.SIGQUIT)
|
||||
|
||||
// Progress filehandle
|
||||
progress := os.Stdout
|
||||
if *flagQuiet {
|
||||
progress, err = os.Open(os.DevNull)
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
os.Exit(-1)
|
||||
}
|
||||
defer progress.Close()
|
||||
}
|
||||
|
||||
// Download models - exit on error or interrupt
|
||||
for _, model := range GetModels() {
|
||||
url, err := URLForModel(model)
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
continue
|
||||
} else if path, err := Download(ctx, progress, url, out); err == nil || err == io.EOF {
|
||||
continue
|
||||
} else if err == context.Canceled {
|
||||
os.Remove(path)
|
||||
fmt.Fprintln(progress, "\nInterrupted")
|
||||
break
|
||||
} else if err == context.DeadlineExceeded {
|
||||
os.Remove(path)
|
||||
fmt.Fprintln(progress, "Timeout downloading model")
|
||||
continue
|
||||
} else {
|
||||
os.Remove(path)
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// GetOut returns the path to the output directory
|
||||
func GetOut() (string, error) {
|
||||
if *flagOut == "" {
|
||||
return os.Getwd()
|
||||
}
|
||||
if info, err := os.Stat(*flagOut); err != nil {
|
||||
return "", err
|
||||
} else if !info.IsDir() {
|
||||
return "", fmt.Errorf("not a directory: %s", info.Name())
|
||||
} else {
|
||||
return *flagOut, nil
|
||||
}
|
||||
}
|
||||
|
||||
// GetModels returns the list of models to download
|
||||
func GetModels() []string {
|
||||
if flag.NArg() == 0 {
|
||||
return modelNames
|
||||
} else {
|
||||
return flag.Args()
|
||||
}
|
||||
}
|
||||
|
||||
// URLForModel returns the URL for the given model on huggingface.co
|
||||
func URLForModel(model string) (string, error) {
|
||||
if filepath.Ext(model) != srcExt {
|
||||
model += srcExt
|
||||
}
|
||||
url, err := url.Parse(srcUrl)
|
||||
if err != nil {
|
||||
return "", err
|
||||
} else {
|
||||
url.Path = filepath.Join(url.Path, model)
|
||||
}
|
||||
return url.String(), nil
|
||||
}
|
||||
|
||||
// Download downloads the model from the given URL to the given output directory
|
||||
func Download(ctx context.Context, p io.Writer, model, out string) (string, error) {
|
||||
// Create HTTP client
|
||||
client := http.Client{
|
||||
Timeout: *flagTimeout,
|
||||
}
|
||||
|
||||
// Initiate the download
|
||||
req, err := http.NewRequest("GET", model, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("%s: %s", model, resp.Status)
|
||||
}
|
||||
|
||||
// If output file exists and is the same size as the model, skip
|
||||
path := filepath.Join(out, filepath.Base(model))
|
||||
if info, err := os.Stat(path); err == nil && info.Size() == resp.ContentLength {
|
||||
fmt.Fprintln(p, "Skipping", model, "as it already exists")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// Create file
|
||||
w, err := os.Create(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer w.Close()
|
||||
|
||||
// Report
|
||||
fmt.Fprintln(p, "Downloading", model, "to", out)
|
||||
|
||||
// Progressively download the model
|
||||
data := make([]byte, bufSize)
|
||||
count, pct := int64(0), int64(0)
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// Cancelled, return error
|
||||
return path, ctx.Err()
|
||||
case <-ticker.C:
|
||||
pct = DownloadReport(p, pct, count, resp.ContentLength)
|
||||
default:
|
||||
// Read body
|
||||
n, err := resp.Body.Read(data)
|
||||
if err != nil {
|
||||
DownloadReport(p, pct, count, resp.ContentLength)
|
||||
return path, err
|
||||
} else if m, err := w.Write(data[:n]); err != nil {
|
||||
return path, err
|
||||
} else {
|
||||
count += int64(m)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Report periodically reports the download progress when percentage changes
|
||||
func DownloadReport(w io.Writer, pct, count, total int64) int64 {
|
||||
pct_ := count * 100 / total
|
||||
if pct_ > pct {
|
||||
fmt.Fprintf(w, " ...%d MB written (%d%%)\n", count/1e6, pct_)
|
||||
}
|
||||
return pct_
|
||||
}
|
@ -1,22 +0,0 @@
|
||||
package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
const (
|
||||
Reset = "\033[0m"
|
||||
RGBPrefix = "\033[38;5;" // followed by RGB values in decimal format separated by colons
|
||||
RGBSuffix = "m"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Colorize text with RGB values, from 0 to 23
|
||||
func Colorize(text string, v int) string {
|
||||
// https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit
|
||||
// Grayscale colors are in the range 232-255
|
||||
return RGBPrefix + fmt.Sprint(v%24+232) + RGBSuffix + text + Reset
|
||||
}
|
@ -1,147 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
// Packages
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type Flags struct {
|
||||
*flag.FlagSet
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// LIFECYCLE
|
||||
|
||||
func NewFlags(name string, args []string) (*Flags, error) {
|
||||
flags := &Flags{
|
||||
FlagSet: flag.NewFlagSet(name, flag.ContinueOnError),
|
||||
}
|
||||
|
||||
// Register the command line arguments
|
||||
registerFlags(flags)
|
||||
|
||||
// Parse command line
|
||||
if err := flags.Parse(args); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Return success
|
||||
return flags, nil
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
func (flags *Flags) GetModel() string {
|
||||
return flags.Lookup("model").Value.String()
|
||||
}
|
||||
|
||||
func (flags *Flags) GetLanguage() string {
|
||||
return flags.Lookup("language").Value.String()
|
||||
}
|
||||
|
||||
func (flags *Flags) IsTranslate() bool {
|
||||
return flags.Lookup("translate").Value.(flag.Getter).Get().(bool)
|
||||
}
|
||||
|
||||
func (flags *Flags) GetOffset() time.Duration {
|
||||
return flags.Lookup("offset").Value.(flag.Getter).Get().(time.Duration)
|
||||
}
|
||||
|
||||
func (flags *Flags) GetDuration() time.Duration {
|
||||
return flags.Lookup("duration").Value.(flag.Getter).Get().(time.Duration)
|
||||
}
|
||||
|
||||
func (flags *Flags) GetThreads() uint {
|
||||
return flags.Lookup("threads").Value.(flag.Getter).Get().(uint)
|
||||
}
|
||||
|
||||
func (flags *Flags) GetOut() string {
|
||||
return strings.ToLower(flags.Lookup("out").Value.String())
|
||||
}
|
||||
|
||||
func (flags *Flags) IsTokens() bool {
|
||||
return flags.Lookup("tokens").Value.String() == "true"
|
||||
}
|
||||
|
||||
func (flags *Flags) IsColorize() bool {
|
||||
return flags.Lookup("colorize").Value.String() == "true"
|
||||
}
|
||||
|
||||
func (flags *Flags) GetMaxLen() uint {
|
||||
return flags.Lookup("max-len").Value.(flag.Getter).Get().(uint)
|
||||
}
|
||||
|
||||
func (flags *Flags) GetMaxTokens() uint {
|
||||
return flags.Lookup("max-tokens").Value.(flag.Getter).Get().(uint)
|
||||
}
|
||||
|
||||
func (flags *Flags) GetWordThreshold() float32 {
|
||||
return float32(flags.Lookup("word-thold").Value.(flag.Getter).Get().(float64))
|
||||
}
|
||||
|
||||
func (flags *Flags) SetParams(context whisper.Context) error {
|
||||
if lang := flags.GetLanguage(); lang != "" && lang != "auto" {
|
||||
fmt.Fprintf(flags.Output(), "Setting language to %q\n", lang)
|
||||
if err := context.SetLanguage(lang); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if flags.IsTranslate() && context.IsMultilingual() {
|
||||
fmt.Fprintf(flags.Output(), "Setting translate to true\n")
|
||||
context.SetTranslate(true)
|
||||
}
|
||||
if offset := flags.GetOffset(); offset != 0 {
|
||||
fmt.Fprintf(flags.Output(), "Setting offset to %v\n", offset)
|
||||
context.SetOffset(offset)
|
||||
}
|
||||
if duration := flags.GetDuration(); duration != 0 {
|
||||
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
|
||||
context.SetDuration(duration)
|
||||
}
|
||||
if threads := flags.GetThreads(); threads != 0 {
|
||||
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
|
||||
context.SetThreads(threads)
|
||||
}
|
||||
if max_len := flags.GetMaxLen(); max_len != 0 {
|
||||
fmt.Fprintf(flags.Output(), "Setting max_segment_length to %d\n", max_len)
|
||||
context.SetMaxSegmentLength(max_len)
|
||||
}
|
||||
if max_tokens := flags.GetMaxTokens(); max_tokens != 0 {
|
||||
fmt.Fprintf(flags.Output(), "Setting max_tokens to %d\n", max_tokens)
|
||||
context.SetMaxTokensPerSegment(max_tokens)
|
||||
}
|
||||
if word_threshold := flags.GetWordThreshold(); word_threshold != 0 {
|
||||
fmt.Fprintf(flags.Output(), "Setting word_threshold to %f\n", word_threshold)
|
||||
context.SetTokenThreshold(word_threshold)
|
||||
}
|
||||
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PRIVATE METHODS
|
||||
|
||||
func registerFlags(flag *Flags) {
|
||||
flag.String("model", "", "Path to the model file")
|
||||
flag.String("language", "", "Spoken language")
|
||||
flag.Bool("translate", false, "Translate from source language to english")
|
||||
flag.Duration("offset", 0, "Time offset")
|
||||
flag.Duration("duration", 0, "Duration of audio to process")
|
||||
flag.Uint("threads", 0, "Number of threads to use")
|
||||
flag.Uint("max-len", 0, "Maximum segment length in characters")
|
||||
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
|
||||
flag.Float64("word-thold", 0, "Maximum segment score")
|
||||
flag.Bool("tokens", false, "Display tokens")
|
||||
flag.Bool("colorize", false, "Colorize tokens")
|
||||
flag.String("out", "", "Output format (srt, none or leave as empty string)")
|
||||
}
|
@ -1,43 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
// Packages
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
)
|
||||
|
||||
func main() {
|
||||
flags, err := NewFlags(filepath.Base(os.Args[0]), os.Args[1:])
|
||||
if err == flag.ErrHelp {
|
||||
os.Exit(0)
|
||||
} else if err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(1)
|
||||
} else if flags.GetModel() == "" {
|
||||
fmt.Fprintln(os.Stderr, "Use -model flag to specify which model file to use")
|
||||
os.Exit(1)
|
||||
} else if flags.NArg() == 0 {
|
||||
fmt.Fprintln(os.Stderr, "No input files specified")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Load model
|
||||
model, err := whisper.New(flags.GetModel())
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer model.Close()
|
||||
|
||||
// Process files
|
||||
for _, filename := range flags.Args() {
|
||||
if err := Process(model, filename, flags); err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
@ -1,132 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
// Package imports
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
wav "github.com/go-audio/wav"
|
||||
)
|
||||
|
||||
func Process(model whisper.Model, path string, flags *Flags) error {
|
||||
var data []float32
|
||||
|
||||
// Create processing context
|
||||
context, err := model.NewContext()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Set the parameters
|
||||
if err := flags.SetParams(context); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("\n%s\n", context.SystemInfo())
|
||||
|
||||
// Open the file
|
||||
fmt.Fprintf(flags.Output(), "Loading %q\n", path)
|
||||
fh, err := os.Open(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer fh.Close()
|
||||
|
||||
// Decode the WAV file - load the full buffer
|
||||
dec := wav.NewDecoder(fh)
|
||||
if buf, err := dec.FullPCMBuffer(); err != nil {
|
||||
return err
|
||||
} else if dec.SampleRate != whisper.SampleRate {
|
||||
return fmt.Errorf("unsupported sample rate: %d", dec.SampleRate)
|
||||
} else if dec.NumChans != 1 {
|
||||
return fmt.Errorf("unsupported number of channels: %d", dec.NumChans)
|
||||
} else {
|
||||
data = buf.AsFloat32Buffer().Data
|
||||
}
|
||||
|
||||
// Segment callback when -tokens is specified
|
||||
var cb whisper.SegmentCallback
|
||||
if flags.IsTokens() {
|
||||
cb = func(segment whisper.Segment) {
|
||||
fmt.Fprintf(flags.Output(), "%02d [%6s->%6s] ", segment.Num, segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
|
||||
for _, token := range segment.Tokens {
|
||||
if flags.IsColorize() && context.IsText(token) {
|
||||
fmt.Fprint(flags.Output(), Colorize(token.Text, int(token.P*24.0)), " ")
|
||||
} else {
|
||||
fmt.Fprint(flags.Output(), token.Text, " ")
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(flags.Output(), "")
|
||||
fmt.Fprintln(flags.Output(), "")
|
||||
}
|
||||
}
|
||||
|
||||
// Process the data
|
||||
fmt.Fprintf(flags.Output(), " ...processing %q\n", path)
|
||||
context.ResetTimings()
|
||||
if err := context.Process(data, cb, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
context.PrintTimings()
|
||||
|
||||
// Print out the results
|
||||
switch {
|
||||
case flags.GetOut() == "srt":
|
||||
return OutputSRT(os.Stdout, context)
|
||||
case flags.GetOut() == "none":
|
||||
return nil
|
||||
default:
|
||||
return Output(os.Stdout, context, flags.IsColorize())
|
||||
}
|
||||
}
|
||||
|
||||
// Output text as SRT file
|
||||
func OutputSRT(w io.Writer, context whisper.Context) error {
|
||||
n := 1
|
||||
for {
|
||||
segment, err := context.NextSegment()
|
||||
if err == io.EOF {
|
||||
return nil
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintln(w, n)
|
||||
fmt.Fprintln(w, srtTimestamp(segment.Start), " --> ", srtTimestamp(segment.End))
|
||||
fmt.Fprintln(w, segment.Text)
|
||||
fmt.Fprintln(w, "")
|
||||
n++
|
||||
}
|
||||
}
|
||||
|
||||
// Output text to terminal
|
||||
func Output(w io.Writer, context whisper.Context, colorize bool) error {
|
||||
for {
|
||||
segment, err := context.NextSegment()
|
||||
if err == io.EOF {
|
||||
return nil
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintf(w, "[%6s->%6s]", segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
|
||||
if colorize {
|
||||
for _, token := range segment.Tokens {
|
||||
if !context.IsText(token) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprint(w, " ", Colorize(token.Text, int(token.P*24.0)))
|
||||
}
|
||||
fmt.Fprint(w, "\n")
|
||||
} else {
|
||||
fmt.Fprintln(w, " ", segment.Text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return srtTimestamp
|
||||
func srtTimestamp(t time.Duration) string {
|
||||
return fmt.Sprintf("%02d:%02d:%02d,%03d", t/time.Hour, (t%time.Hour)/time.Minute, (t%time.Minute)/time.Second, (t%time.Second)/time.Millisecond)
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
module github.com/ggerganov/whisper.cpp/bindings/go
|
||||
|
||||
go 1.23
|
||||
|
||||
require (
|
||||
github.com/go-audio/wav v1.1.0
|
||||
github.com/stretchr/testify v1.9.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/go-audio/audio v1.0.0 // indirect
|
||||
github.com/go-audio/riff v1.0.0 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
@ -1,16 +0,0 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
|
||||
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
|
||||
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
|
||||
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
|
||||
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
|
||||
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
@ -1,204 +0,0 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CGO
|
||||
|
||||
/*
|
||||
#include <whisper.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
func (p *Params) SetTranslate(v bool) {
|
||||
p.translate = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetSplitOnWord(v bool) {
|
||||
p.split_on_word = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetNoContext(v bool) {
|
||||
p.no_context = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetSingleSegment(v bool) {
|
||||
p.single_segment = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetPrintSpecial(v bool) {
|
||||
p.print_special = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetPrintProgress(v bool) {
|
||||
p.print_progress = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetPrintRealtime(v bool) {
|
||||
p.print_realtime = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetPrintTimestamps(v bool) {
|
||||
p.print_timestamps = toBool(v)
|
||||
}
|
||||
|
||||
// Set language id
|
||||
func (p *Params) SetLanguage(lang int) error {
|
||||
if lang == -1 {
|
||||
p.language = nil
|
||||
return nil
|
||||
}
|
||||
str := C.whisper_lang_str(C.int(lang))
|
||||
if str == nil {
|
||||
return ErrInvalidLanguage
|
||||
} else {
|
||||
p.language = str
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get language id
|
||||
func (p *Params) Language() int {
|
||||
if p.language == nil {
|
||||
return -1
|
||||
}
|
||||
return int(C.whisper_lang_id(p.language))
|
||||
}
|
||||
|
||||
// Threads available
|
||||
func (p *Params) Threads() int {
|
||||
return int(p.n_threads)
|
||||
}
|
||||
|
||||
// Set number of threads to use
|
||||
func (p *Params) SetThreads(threads int) {
|
||||
p.n_threads = C.int(threads)
|
||||
}
|
||||
|
||||
// Set start offset in ms
|
||||
func (p *Params) SetOffset(offset_ms int) {
|
||||
p.offset_ms = C.int(offset_ms)
|
||||
}
|
||||
|
||||
// Set audio duration to process in ms
|
||||
func (p *Params) SetDuration(duration_ms int) {
|
||||
p.duration_ms = C.int(duration_ms)
|
||||
}
|
||||
|
||||
// Set timestamp token probability threshold (~0.01)
|
||||
func (p *Params) SetTokenThreshold(t float32) {
|
||||
p.thold_pt = C.float(t)
|
||||
}
|
||||
|
||||
// Set timestamp token sum probability threshold (~0.01)
|
||||
func (p *Params) SetTokenSumThreshold(t float32) {
|
||||
p.thold_ptsum = C.float(t)
|
||||
}
|
||||
|
||||
// Set max segment length in characters
|
||||
func (p *Params) SetMaxSegmentLength(n int) {
|
||||
p.max_len = C.int(n)
|
||||
}
|
||||
|
||||
func (p *Params) SetTokenTimestamps(b bool) {
|
||||
p.token_timestamps = toBool(b)
|
||||
}
|
||||
|
||||
// Set max tokens per segment (0 = no limit)
|
||||
func (p *Params) SetMaxTokensPerSegment(n int) {
|
||||
p.max_tokens = C.int(n)
|
||||
}
|
||||
|
||||
// Set audio encoder context
|
||||
func (p *Params) SetAudioCtx(n int) {
|
||||
p.audio_ctx = C.int(n)
|
||||
}
|
||||
|
||||
func (p *Params) SetMaxContext(n int) {
|
||||
p.n_max_text_ctx = C.int(n)
|
||||
}
|
||||
|
||||
func (p *Params) SetBeamSize(n int) {
|
||||
p.beam_search.beam_size = C.int(n)
|
||||
}
|
||||
|
||||
func (p *Params) SetEntropyThold(t float32) {
|
||||
p.entropy_thold = C.float(t)
|
||||
}
|
||||
|
||||
func (p *Params) SetTemperature(t float32) {
|
||||
p.temperature = C.float(t)
|
||||
}
|
||||
|
||||
// Sets the fallback temperature incrementation
|
||||
// Pass -1.0 to disable this feature
|
||||
func (p *Params) SetTemperatureFallback(t float32) {
|
||||
p.temperature_inc = C.float(t)
|
||||
}
|
||||
|
||||
// Set initial prompt
|
||||
func (p *Params) SetInitialPrompt(prompt string) {
|
||||
p.initial_prompt = C.CString(prompt)
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PRIVATE METHODS
|
||||
|
||||
func toBool(v bool) C.bool {
|
||||
if v {
|
||||
return C.bool(true)
|
||||
}
|
||||
return C.bool(false)
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// STRINGIFY
|
||||
|
||||
func (p *Params) String() string {
|
||||
str := "<whisper.params"
|
||||
str += fmt.Sprintf(" strategy=%v", p.strategy)
|
||||
str += fmt.Sprintf(" n_threads=%d", p.n_threads)
|
||||
if p.language != nil {
|
||||
str += fmt.Sprintf(" language=%s", C.GoString(p.language))
|
||||
}
|
||||
str += fmt.Sprintf(" n_max_text_ctx=%d", p.n_max_text_ctx)
|
||||
str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
|
||||
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
|
||||
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
|
||||
str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
|
||||
str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
|
||||
str += fmt.Sprintf(" temperature=%f", p.temperature)
|
||||
str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
|
||||
str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
|
||||
if p.translate {
|
||||
str += " translate"
|
||||
}
|
||||
if p.no_context {
|
||||
str += " no_context"
|
||||
}
|
||||
if p.single_segment {
|
||||
str += " single_segment"
|
||||
}
|
||||
if p.print_special {
|
||||
str += " print_special"
|
||||
}
|
||||
if p.print_progress {
|
||||
str += " print_progress"
|
||||
}
|
||||
if p.print_realtime {
|
||||
str += " print_realtime"
|
||||
}
|
||||
if p.print_timestamps {
|
||||
str += " print_timestamps"
|
||||
}
|
||||
if p.token_timestamps {
|
||||
str += " token_timestamps"
|
||||
}
|
||||
|
||||
return str + ">"
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
// Bindings
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// ERRORS
|
||||
|
||||
var (
|
||||
ErrUnableToLoadModel = errors.New("unable to load model")
|
||||
ErrInternalAppError = errors.New("internal application error")
|
||||
ErrProcessingFailed = errors.New("processing failed")
|
||||
ErrUnsupportedLanguage = errors.New("unsupported language")
|
||||
ErrModelNotMultilingual = errors.New("model is not multilingual")
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
// SampleRate is the sample rate of the audio data.
|
||||
const SampleRate = whisper.SampleRate
|
||||
|
||||
// SampleBits is the number of bytes per sample.
|
||||
const SampleBits = whisper.SampleBits
|
@ -1,342 +0,0 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
// Bindings
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type context struct {
|
||||
n int
|
||||
model *model
|
||||
params whisper.Params
|
||||
}
|
||||
|
||||
// Make sure context adheres to the interface
|
||||
var _ Context = (*context)(nil)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// LIFECYCLE
|
||||
|
||||
func newContext(model *model, params whisper.Params) (Context, error) {
|
||||
context := new(context)
|
||||
context.model = model
|
||||
context.params = params
|
||||
|
||||
// Return success
|
||||
return context, nil
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Set the language to use for speech recognition.
|
||||
func (context *context) SetLanguage(lang string) error {
|
||||
if context.model.ctx == nil {
|
||||
return ErrInternalAppError
|
||||
}
|
||||
if !context.model.IsMultilingual() {
|
||||
return ErrModelNotMultilingual
|
||||
}
|
||||
|
||||
if lang == "auto" {
|
||||
context.params.SetLanguage(-1)
|
||||
} else if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
|
||||
return ErrUnsupportedLanguage
|
||||
} else if err := context.params.SetLanguage(id); err != nil {
|
||||
return err
|
||||
}
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
func (context *context) IsMultilingual() bool {
|
||||
return context.model.IsMultilingual()
|
||||
}
|
||||
|
||||
// Get language
|
||||
func (context *context) Language() string {
|
||||
id := context.params.Language()
|
||||
if id == -1 {
|
||||
return "auto"
|
||||
}
|
||||
return whisper.Whisper_lang_str(context.params.Language())
|
||||
}
|
||||
|
||||
// Set translate flag
|
||||
func (context *context) SetTranslate(v bool) {
|
||||
context.params.SetTranslate(v)
|
||||
}
|
||||
|
||||
func (context *context) SetSplitOnWord(v bool) {
|
||||
context.params.SetSplitOnWord(v)
|
||||
}
|
||||
|
||||
// Set number of threads to use
|
||||
func (context *context) SetThreads(v uint) {
|
||||
context.params.SetThreads(int(v))
|
||||
}
|
||||
|
||||
// Set time offset
|
||||
func (context *context) SetOffset(v time.Duration) {
|
||||
context.params.SetOffset(int(v.Milliseconds()))
|
||||
}
|
||||
|
||||
// Set duration of audio to process
|
||||
func (context *context) SetDuration(v time.Duration) {
|
||||
context.params.SetDuration(int(v.Milliseconds()))
|
||||
}
|
||||
|
||||
// Set timestamp token probability threshold (~0.01)
|
||||
func (context *context) SetTokenThreshold(t float32) {
|
||||
context.params.SetTokenThreshold(t)
|
||||
}
|
||||
|
||||
// Set timestamp token sum probability threshold (~0.01)
|
||||
func (context *context) SetTokenSumThreshold(t float32) {
|
||||
context.params.SetTokenSumThreshold(t)
|
||||
}
|
||||
|
||||
// Set max segment length in characters
|
||||
func (context *context) SetMaxSegmentLength(n uint) {
|
||||
context.params.SetMaxSegmentLength(int(n))
|
||||
}
|
||||
|
||||
// Set token timestamps flag
|
||||
func (context *context) SetTokenTimestamps(b bool) {
|
||||
context.params.SetTokenTimestamps(b)
|
||||
}
|
||||
|
||||
// Set max tokens per segment (0 = no limit)
|
||||
func (context *context) SetMaxTokensPerSegment(n uint) {
|
||||
context.params.SetMaxTokensPerSegment(int(n))
|
||||
}
|
||||
|
||||
// Set audio encoder context
|
||||
func (context *context) SetAudioCtx(n uint) {
|
||||
context.params.SetAudioCtx(int(n))
|
||||
}
|
||||
|
||||
// Set maximum number of text context tokens to store
|
||||
func (context *context) SetMaxContext(n int) {
|
||||
context.params.SetMaxContext(n)
|
||||
}
|
||||
|
||||
// Set Beam Size
|
||||
func (context *context) SetBeamSize(n int) {
|
||||
context.params.SetBeamSize(n)
|
||||
}
|
||||
|
||||
// Set Entropy threshold
|
||||
func (context *context) SetEntropyThold(t float32) {
|
||||
context.params.SetEntropyThold(t)
|
||||
}
|
||||
|
||||
// Set Temperature
|
||||
func (context *context) SetTemperature(t float32) {
|
||||
context.params.SetTemperature(t)
|
||||
}
|
||||
|
||||
// Set the fallback temperature incrementation
|
||||
// Pass -1.0 to disable this feature
|
||||
func (context *context) SetTemperatureFallback(t float32) {
|
||||
context.params.SetTemperatureFallback(t)
|
||||
}
|
||||
|
||||
// Set initial prompt
|
||||
func (context *context) SetInitialPrompt(prompt string) {
|
||||
context.params.SetInitialPrompt(prompt)
|
||||
}
|
||||
|
||||
// ResetTimings resets the mode timings. Should be called before processing
|
||||
func (context *context) ResetTimings() {
|
||||
context.model.ctx.Whisper_reset_timings()
|
||||
}
|
||||
|
||||
// PrintTimings prints the model timings to stdout.
|
||||
func (context *context) PrintTimings() {
|
||||
context.model.ctx.Whisper_print_timings()
|
||||
}
|
||||
|
||||
// SystemInfo returns the system information
|
||||
func (context *context) SystemInfo() string {
|
||||
return fmt.Sprintf("system_info: n_threads = %d / %d | %s\n",
|
||||
context.params.Threads(),
|
||||
runtime.NumCPU(),
|
||||
whisper.Whisper_print_system_info(),
|
||||
)
|
||||
}
|
||||
|
||||
// Use mel data at offset_ms to try and auto-detect the spoken language
|
||||
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
// Returns the probabilities of all languages.
|
||||
func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]float32, error) {
|
||||
langProbs, err := context.model.ctx.Whisper_lang_auto_detect(offset_ms, n_threads)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return langProbs, nil
|
||||
}
|
||||
|
||||
// Process new sample data and return any errors
|
||||
func (context *context) Process(
|
||||
data []float32,
|
||||
callNewSegment SegmentCallback,
|
||||
callProgress ProgressCallback,
|
||||
) error {
|
||||
if context.model.ctx == nil {
|
||||
return ErrInternalAppError
|
||||
}
|
||||
// If the callback is defined then we force on single_segment mode
|
||||
if callNewSegment != nil {
|
||||
context.params.SetSingleSegment(true)
|
||||
}
|
||||
|
||||
// We don't do parallel processing at the moment
|
||||
processors := 0
|
||||
if processors > 1 {
|
||||
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}, func(progress int) {
|
||||
if callProgress != nil {
|
||||
callProgress(progress)
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
// Return the next segment of tokens
|
||||
func (context *context) NextSegment() (Segment, error) {
|
||||
if context.model.ctx == nil {
|
||||
return Segment{}, ErrInternalAppError
|
||||
}
|
||||
if context.n >= context.model.ctx.Whisper_full_n_segments() {
|
||||
return Segment{}, io.EOF
|
||||
}
|
||||
|
||||
// Populate result
|
||||
result := toSegment(context.model.ctx, context.n)
|
||||
|
||||
// Increment the cursor
|
||||
context.n++
|
||||
|
||||
// Return success
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Test for text tokens
|
||||
func (context *context) IsText(t Token) bool {
|
||||
switch {
|
||||
case context.IsBEG(t):
|
||||
return false
|
||||
case context.IsSOT(t):
|
||||
return false
|
||||
case whisper.Token(t.Id) >= context.model.ctx.Whisper_token_eot():
|
||||
return false
|
||||
case context.IsPREV(t):
|
||||
return false
|
||||
case context.IsSOLM(t):
|
||||
return false
|
||||
case context.IsNOT(t):
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Test for "begin" token
|
||||
func (context *context) IsBEG(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_beg()
|
||||
}
|
||||
|
||||
// Test for "start of transcription" token
|
||||
func (context *context) IsSOT(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_sot()
|
||||
}
|
||||
|
||||
// Test for "end of transcription" token
|
||||
func (context *context) IsEOT(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_eot()
|
||||
}
|
||||
|
||||
// Test for "start of prev" token
|
||||
func (context *context) IsPREV(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_prev()
|
||||
}
|
||||
|
||||
// Test for "start of lm" token
|
||||
func (context *context) IsSOLM(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_solm()
|
||||
}
|
||||
|
||||
// Test for "No timestamps" token
|
||||
func (context *context) IsNOT(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_not()
|
||||
}
|
||||
|
||||
// Test for token associated with a specific language
|
||||
func (context *context) IsLANG(t Token, lang string) bool {
|
||||
if id := context.model.ctx.Whisper_lang_id(lang); id >= 0 {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_lang(id)
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PRIVATE METHODS
|
||||
|
||||
func toSegment(ctx *whisper.Context, n int) Segment {
|
||||
return Segment{
|
||||
Num: n,
|
||||
Text: strings.TrimSpace(ctx.Whisper_full_get_segment_text(n)),
|
||||
Start: time.Duration(ctx.Whisper_full_get_segment_t0(n)) * time.Millisecond * 10,
|
||||
End: time.Duration(ctx.Whisper_full_get_segment_t1(n)) * time.Millisecond * 10,
|
||||
Tokens: toTokens(ctx, n),
|
||||
}
|
||||
}
|
||||
|
||||
func toTokens(ctx *whisper.Context, n int) []Token {
|
||||
result := make([]Token, ctx.Whisper_full_n_tokens(n))
|
||||
for i := 0; i < len(result); i++ {
|
||||
data := ctx.Whisper_full_get_token_data(n, i)
|
||||
|
||||
result[i] = Token{
|
||||
Id: int(ctx.Whisper_full_get_token_id(n, i)),
|
||||
Text: ctx.Whisper_full_get_token_text(n, i),
|
||||
P: ctx.Whisper_full_get_token_p(n, i),
|
||||
Start: time.Duration(data.T0()) * time.Millisecond * 10,
|
||||
End: time.Duration(data.T1()) * time.Millisecond * 10,
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
@ -1,93 +0,0 @@
|
||||
package whisper_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
"github.com/go-audio/wav"
|
||||
assert "github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestSetLanguage(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
// This returns an error since
|
||||
// the model 'models/ggml-small.en.bin'
|
||||
// that is loaded is not multilingual
|
||||
err = context.SetLanguage("en")
|
||||
assert.Error(err)
|
||||
}
|
||||
|
||||
func TestContextModelIsMultilingual(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
isMultilingual := context.IsMultilingual()
|
||||
|
||||
// This returns false since
|
||||
// the model 'models/ggml-small.en.bin'
|
||||
// that is loaded is not multilingual
|
||||
assert.False(isMultilingual)
|
||||
}
|
||||
|
||||
func TestLanguage(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
// This always returns en since
|
||||
// the model 'models/ggml-small.en.bin'
|
||||
// that is loaded is not multilingual
|
||||
expectedLanguage := "en"
|
||||
actualLanguage := context.Language()
|
||||
assert.Equal(expectedLanguage, actualLanguage)
|
||||
}
|
||||
|
||||
func TestProcess(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Decode the WAV file - load the full buffer
|
||||
dec := wav.NewDecoder(fh)
|
||||
buf, err := dec.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
assert.Equal(uint16(1), dec.NumChans)
|
||||
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
err = context.Process(data, nil, nil)
|
||||
assert.NoError(err)
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
/*
|
||||
This is the higher-level speech-to-text whisper.cpp API for go
|
||||
*/
|
||||
package whisper
|
@ -1,104 +0,0 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"io"
|
||||
"time"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
// SegmentCallback is the callback function for processing segments in real
|
||||
// time. It is called during the Process function
|
||||
type SegmentCallback func(Segment)
|
||||
|
||||
// ProgressCallback is the callback function for reporting progress during
|
||||
// processing. It is called during the Process function
|
||||
type ProgressCallback func(int)
|
||||
|
||||
// Model is the interface to a whisper model. Create a new model with the
|
||||
// function whisper.New(string)
|
||||
type Model interface {
|
||||
io.Closer
|
||||
|
||||
// Return a new speech-to-text context.
|
||||
NewContext() (Context, error)
|
||||
|
||||
// Return true if the model is multilingual.
|
||||
IsMultilingual() bool
|
||||
|
||||
// Return all languages supported.
|
||||
Languages() []string
|
||||
}
|
||||
|
||||
// Context is the speach recognition context.
|
||||
type Context interface {
|
||||
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
|
||||
SetTranslate(bool) // Set translate flag
|
||||
IsMultilingual() bool // Return true if the model is multilingual.
|
||||
Language() string // Get language
|
||||
|
||||
SetOffset(time.Duration) // Set offset
|
||||
SetDuration(time.Duration) // Set duration
|
||||
SetThreads(uint) // Set number of threads to use
|
||||
SetSplitOnWord(bool) // Set split on word flag
|
||||
SetTokenThreshold(float32) // Set timestamp token probability threshold
|
||||
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
|
||||
SetMaxSegmentLength(uint) // Set max segment length in characters
|
||||
SetTokenTimestamps(bool) // Set token timestamps flag
|
||||
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
|
||||
SetAudioCtx(uint) // Set audio encoder context
|
||||
SetMaxContext(n int) // Set maximum number of text context tokens to store
|
||||
SetBeamSize(n int) // Set Beam Size
|
||||
SetEntropyThold(t float32) // Set Entropy threshold
|
||||
SetInitialPrompt(prompt string) // Set initial prompt
|
||||
SetTemperature(t float32) // Set temperature
|
||||
SetTemperatureFallback(t float32) // Set temperature incrementation
|
||||
|
||||
// Process mono audio data and return any errors.
|
||||
// If defined, newly generated segments are passed to the
|
||||
// callback function during processing.
|
||||
Process([]float32, SegmentCallback, ProgressCallback) error
|
||||
|
||||
// After process is called, return segments until the end of the stream
|
||||
// is reached, when io.EOF is returned.
|
||||
NextSegment() (Segment, error)
|
||||
|
||||
IsBEG(Token) bool // Test for "begin" token
|
||||
IsSOT(Token) bool // Test for "start of transcription" token
|
||||
IsEOT(Token) bool // Test for "end of transcription" token
|
||||
IsPREV(Token) bool // Test for "start of prev" token
|
||||
IsSOLM(Token) bool // Test for "start of lm" token
|
||||
IsNOT(Token) bool // Test for "No timestamps" token
|
||||
IsLANG(Token, string) bool // Test for token associated with a specific language
|
||||
IsText(Token) bool // Test for text token
|
||||
|
||||
// Timings
|
||||
PrintTimings()
|
||||
ResetTimings()
|
||||
|
||||
SystemInfo() string
|
||||
}
|
||||
|
||||
// Segment is the text result of a speech recognition.
|
||||
type Segment struct {
|
||||
// Segment Number
|
||||
Num int
|
||||
|
||||
// Time beginning and end timestamps for the segment.
|
||||
Start, End time.Duration
|
||||
|
||||
// The text of the segment.
|
||||
Text string
|
||||
|
||||
// The tokens of the segment.
|
||||
Tokens []Token
|
||||
}
|
||||
|
||||
// Token is a text or special token
|
||||
type Token struct {
|
||||
Id int
|
||||
Text string
|
||||
P float32
|
||||
Start, End time.Duration
|
||||
}
|
@ -1,101 +0,0 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
// Bindings
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type model struct {
|
||||
path string
|
||||
ctx *whisper.Context
|
||||
}
|
||||
|
||||
// Make sure model adheres to the interface
|
||||
var _ Model = (*model)(nil)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// LIFECYCLE
|
||||
|
||||
func New(path string) (Model, error) {
|
||||
model := new(model)
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
return nil, err
|
||||
} else if ctx := whisper.Whisper_init(path); ctx == nil {
|
||||
return nil, ErrUnableToLoadModel
|
||||
} else {
|
||||
model.ctx = ctx
|
||||
model.path = path
|
||||
}
|
||||
|
||||
// Return success
|
||||
return model, nil
|
||||
}
|
||||
|
||||
func (model *model) Close() error {
|
||||
if model.ctx != nil {
|
||||
model.ctx.Whisper_free()
|
||||
}
|
||||
|
||||
// Release resources
|
||||
model.ctx = nil
|
||||
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// STRINGIFY
|
||||
|
||||
func (model *model) String() string {
|
||||
str := "<whisper.model"
|
||||
if model.ctx != nil {
|
||||
str += fmt.Sprintf(" model=%q", model.path)
|
||||
}
|
||||
return str + ">"
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Return true if model is multilingual (language and translation options are supported)
|
||||
func (model *model) IsMultilingual() bool {
|
||||
return model.ctx.Whisper_is_multilingual() != 0
|
||||
}
|
||||
|
||||
// Return all recognized languages. Initially it is set to auto-detect
|
||||
func (model *model) Languages() []string {
|
||||
result := make([]string, 0, whisper.Whisper_lang_max_id())
|
||||
for i := 0; i < whisper.Whisper_lang_max_id(); i++ {
|
||||
str := whisper.Whisper_lang_str(i)
|
||||
if model.ctx.Whisper_lang_id(str) >= 0 {
|
||||
result = append(result, str)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (model *model) NewContext() (Context, error) {
|
||||
if model.ctx == nil {
|
||||
return nil, ErrInternalAppError
|
||||
}
|
||||
|
||||
// Create new context
|
||||
params := model.ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)
|
||||
params.SetTranslate(false)
|
||||
params.SetPrintSpecial(false)
|
||||
params.SetPrintProgress(false)
|
||||
params.SetPrintRealtime(false)
|
||||
params.SetPrintTimestamps(false)
|
||||
params.SetThreads(runtime.NumCPU())
|
||||
params.SetNoContext(true)
|
||||
|
||||
// Return new context
|
||||
return newContext(model, params)
|
||||
}
|
@ -1,91 +0,0 @@
|
||||
package whisper_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
assert "github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
t.Run("valid model path", func(t *testing.T) {
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
})
|
||||
|
||||
t.Run("invalid model path", func(t *testing.T) {
|
||||
invalidModelPath := "invalid-model-path.bin"
|
||||
model, err := whisper.New(invalidModelPath)
|
||||
assert.Error(err)
|
||||
assert.Nil(model)
|
||||
})
|
||||
}
|
||||
|
||||
func TestClose(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
|
||||
err = model.Close()
|
||||
assert.NoError(err)
|
||||
}
|
||||
|
||||
func TestNewContext(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
assert.NotNil(context)
|
||||
}
|
||||
|
||||
func TestIsMultilingual(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
isMultilingual := model.IsMultilingual()
|
||||
|
||||
// This returns false since
|
||||
// the model 'models/ggml-small.en.bin'
|
||||
// that is loaded is not multilingual
|
||||
assert.False(isMultilingual)
|
||||
}
|
||||
|
||||
func TestLanguages(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
expectedLanguages := []string{
|
||||
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
|
||||
"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
|
||||
"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
|
||||
"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
|
||||
"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
|
||||
"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
|
||||
"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
|
||||
"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
|
||||
"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
|
||||
}
|
||||
|
||||
actualLanguages := model.Languages()
|
||||
|
||||
assert.Equal(expectedLanguages, actualLanguages)
|
||||
}
|
@ -1,6 +0,0 @@
|
||||
package whisper_test
|
||||
|
||||
const (
|
||||
ModelPath = "../../models/ggml-small.en.bin"
|
||||
SamplePath = "../../samples/jfk.wav"
|
||||
)
|
Binary file not shown.
@ -1,468 +0,0 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CGO
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
|
||||
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
|
||||
#include <whisper.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
extern void callNewSegment(void* user_data, int new);
|
||||
extern void callProgress(void* user_data, int progress);
|
||||
extern bool callEncoderBegin(void* user_data);
|
||||
|
||||
// Text segment callback
|
||||
// Called on every newly generated text segment
|
||||
// Use the whisper_full_...() functions to obtain the text segments
|
||||
static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
|
||||
if(user_data != NULL && ctx != NULL) {
|
||||
callNewSegment(user_data, n_new);
|
||||
}
|
||||
}
|
||||
|
||||
// Progress callback
|
||||
// Called on every newly generated text segment
|
||||
// Use the whisper_full_...() functions to obtain the text segments
|
||||
static void whisper_progress_cb(struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
|
||||
if(user_data != NULL && ctx != NULL) {
|
||||
callProgress(user_data, progress);
|
||||
}
|
||||
}
|
||||
|
||||
// Encoder begin callback
|
||||
// If not NULL, called before the encoder starts
|
||||
// If it returns false, the computation is aborted
|
||||
static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
|
||||
if(user_data != NULL && ctx != NULL) {
|
||||
return callEncoderBegin(user_data);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get default parameters and set callbacks
|
||||
static struct whisper_full_params whisper_full_default_params_cb(struct whisper_context* ctx, enum whisper_sampling_strategy strategy) {
|
||||
struct whisper_full_params params = whisper_full_default_params(strategy);
|
||||
params.new_segment_callback = whisper_new_segment_cb;
|
||||
params.new_segment_callback_user_data = (void*)(ctx);
|
||||
params.encoder_begin_callback = whisper_encoder_begin_cb;
|
||||
params.encoder_begin_callback_user_data = (void*)(ctx);
|
||||
params.progress_callback = whisper_progress_cb;
|
||||
params.progress_callback_user_data = (void*)(ctx);
|
||||
return params;
|
||||
}
|
||||
*/
|
||||
import "C"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type (
|
||||
Context C.struct_whisper_context
|
||||
Token C.whisper_token
|
||||
TokenData C.struct_whisper_token_data
|
||||
SamplingStrategy C.enum_whisper_sampling_strategy
|
||||
Params C.struct_whisper_full_params
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// GLOBALS
|
||||
|
||||
const (
|
||||
SAMPLING_GREEDY SamplingStrategy = C.WHISPER_SAMPLING_GREEDY
|
||||
SAMPLING_BEAM_SEARCH SamplingStrategy = C.WHISPER_SAMPLING_BEAM_SEARCH
|
||||
)
|
||||
|
||||
const (
|
||||
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
|
||||
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
|
||||
NumFFT = C.WHISPER_N_FFT
|
||||
HopLength = C.WHISPER_HOP_LENGTH
|
||||
ChunkSize = C.WHISPER_CHUNK_SIZE
|
||||
)
|
||||
|
||||
var (
|
||||
ErrTokenizerFailed = errors.New("whisper_tokenize failed")
|
||||
ErrAutoDetectFailed = errors.New("whisper_lang_auto_detect failed")
|
||||
ErrConversionFailed = errors.New("whisper_convert failed")
|
||||
ErrInvalidLanguage = errors.New("invalid language")
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Allocates all memory needed for the model and loads the model from the given file.
|
||||
// Returns NULL on failure.
|
||||
func Whisper_init(path string) *Context {
|
||||
cPath := C.CString(path)
|
||||
defer C.free(unsafe.Pointer(cPath))
|
||||
if ctx := C.whisper_init_from_file_with_params(cPath, C.whisper_context_default_params()); ctx != nil {
|
||||
return (*Context)(ctx)
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Frees all memory allocated by the model.
|
||||
func (ctx *Context) Whisper_free() {
|
||||
C.whisper_free((*C.struct_whisper_context)(ctx))
|
||||
}
|
||||
|
||||
// Convert RAW PCM audio to log mel spectrogram.
|
||||
// The resulting spectrogram is stored inside the provided whisper context.
|
||||
func (ctx *Context) Whisper_pcm_to_mel(data []float32, threads int) error {
|
||||
if C.whisper_pcm_to_mel((*C.struct_whisper_context)(ctx), (*C.float)(&data[0]), C.int(len(data)), C.int(threads)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
|
||||
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
||||
// n_mel must be 80
|
||||
func (ctx *Context) Whisper_set_mel(data []float32, n_mel int) error {
|
||||
if C.whisper_set_mel((*C.struct_whisper_context)(ctx), (*C.float)(&data[0]), C.int(len(data)), C.int(n_mel)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
|
||||
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
// offset can be used to specify the offset of the first frame in the spectrogram.
|
||||
func (ctx *Context) Whisper_encode(offset, threads int) error {
|
||||
if C.whisper_encode((*C.struct_whisper_context)(ctx), C.int(offset), C.int(threads)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
||||
// Make sure to call whisper_encode() first.
|
||||
// tokens + n_tokens is the provided context for the decoder.
|
||||
// n_past is the number of tokens to use from previous decoder calls.
|
||||
func (ctx *Context) Whisper_decode(tokens []Token, past, threads int) error {
|
||||
if C.whisper_decode((*C.struct_whisper_context)(ctx), (*C.whisper_token)(&tokens[0]), C.int(len(tokens)), C.int(past), C.int(threads)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the provided text into tokens. The tokens pointer must be large enough to hold the resulting tokens.
|
||||
// Returns the number of tokens on success
|
||||
func (ctx *Context) Whisper_tokenize(text string, tokens []Token) (int, error) {
|
||||
cText := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(cText))
|
||||
if n := C.whisper_tokenize((*C.struct_whisper_context)(ctx), cText, (*C.whisper_token)(&tokens[0]), C.int(len(tokens))); n >= 0 {
|
||||
return int(n), nil
|
||||
} else {
|
||||
return 0, ErrTokenizerFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Return the id of the specified language, returns -1 if not found
|
||||
// Examples:
|
||||
//
|
||||
// "de" -> 2
|
||||
// "german" -> 2
|
||||
func (ctx *Context) Whisper_lang_id(lang string) int {
|
||||
return int(C.whisper_lang_id(C.CString(lang)))
|
||||
}
|
||||
|
||||
// Largest language id (i.e. number of available languages - 1)
|
||||
func Whisper_lang_max_id() int {
|
||||
return int(C.whisper_lang_max_id())
|
||||
}
|
||||
|
||||
// Return the short string of the specified language id (e.g. 2 -> "de"),
|
||||
// returns empty string if not found
|
||||
func Whisper_lang_str(id int) string {
|
||||
return C.GoString(C.whisper_lang_str(C.int(id)))
|
||||
}
|
||||
|
||||
// Use mel data at offset_ms to try and auto-detect the spoken language
|
||||
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
// Returns the probabilities of all languages.
|
||||
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
||||
func (ctx *Context) Whisper_lang_auto_detect(offset_ms, n_threads int) ([]float32, error) {
|
||||
probs := make([]float32, Whisper_lang_max_id()+1)
|
||||
if n := int(C.whisper_lang_auto_detect((*C.struct_whisper_context)(ctx), C.int(offset_ms), C.int(n_threads), (*C.float)(&probs[0]))); n < 0 {
|
||||
return nil, ErrAutoDetectFailed
|
||||
} else {
|
||||
return probs, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_len() int {
|
||||
return int(C.whisper_n_len((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_vocab() int {
|
||||
return int(C.whisper_n_vocab((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_text_ctx() int {
|
||||
return int(C.whisper_n_text_ctx((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_audio_ctx() int {
|
||||
return int(C.whisper_n_audio_ctx((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_is_multilingual() int {
|
||||
return int(C.whisper_is_multilingual((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// The probabilities for the next token
|
||||
//func (ctx *Whisper_context) Whisper_get_probs() []float32 {
|
||||
// return (*[1 << 30]float32)(unsafe.Pointer(C.whisper_get_probs((*C.struct_whisper_context)(ctx))))[:ctx.Whisper_n_vocab()]
|
||||
//}
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
func (ctx *Context) Whisper_token_to_str(token Token) string {
|
||||
return C.GoString(C.whisper_token_to_str((*C.struct_whisper_context)(ctx), C.whisper_token(token)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_eot() Token {
|
||||
return Token(C.whisper_token_eot((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_sot() Token {
|
||||
return Token(C.whisper_token_sot((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_prev() Token {
|
||||
return Token(C.whisper_token_prev((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_solm() Token {
|
||||
return Token(C.whisper_token_solm((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_not() Token {
|
||||
return Token(C.whisper_token_not((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_beg() Token {
|
||||
return Token(C.whisper_token_beg((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_lang(lang_id int) Token {
|
||||
return Token(C.whisper_token_lang((*C.struct_whisper_context)(ctx), C.int(lang_id)))
|
||||
}
|
||||
|
||||
// Task tokens
|
||||
func (ctx *Context) Whisper_token_translate() Token {
|
||||
return Token(C.whisper_token_translate((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Task tokens
|
||||
func (ctx *Context) Whisper_token_transcribe() Token {
|
||||
return Token(C.whisper_token_transcribe((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Performance information
|
||||
func (ctx *Context) Whisper_print_timings() {
|
||||
C.whisper_print_timings((*C.struct_whisper_context)(ctx))
|
||||
}
|
||||
|
||||
// Performance information
|
||||
func (ctx *Context) Whisper_reset_timings() {
|
||||
C.whisper_reset_timings((*C.struct_whisper_context)(ctx))
|
||||
}
|
||||
|
||||
// Print system information
|
||||
func Whisper_print_system_info() string {
|
||||
return C.GoString(C.whisper_print_system_info())
|
||||
}
|
||||
|
||||
// Return default parameters for a strategy
|
||||
func (ctx *Context) Whisper_full_default_params(strategy SamplingStrategy) Params {
|
||||
// Get default parameters
|
||||
return Params(C.whisper_full_default_params_cb((*C.struct_whisper_context)(ctx), C.enum_whisper_sampling_strategy(strategy)))
|
||||
}
|
||||
|
||||
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
// Uses the specified decoding strategy to obtain the text.
|
||||
func (ctx *Context) Whisper_full(
|
||||
params Params,
|
||||
samples []float32,
|
||||
encoderBeginCallback func() bool,
|
||||
newSegmentCallback func(int),
|
||||
progressCallback func(int),
|
||||
) error {
|
||||
registerEncoderBeginCallback(ctx, encoderBeginCallback)
|
||||
registerNewSegmentCallback(ctx, newSegmentCallback)
|
||||
registerProgressCallback(ctx, progressCallback)
|
||||
defer registerEncoderBeginCallback(ctx, nil)
|
||||
defer registerNewSegmentCallback(ctx, nil)
|
||||
defer registerProgressCallback(ctx, nil)
|
||||
if C.whisper_full((*C.struct_whisper_context)(ctx), (C.struct_whisper_full_params)(params), (*C.float)(&samples[0]), C.int(len(samples))) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using whisper_full()
|
||||
// It seems this approach can offer some speedup in some cases.
|
||||
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
func (ctx *Context) Whisper_full_parallel(params Params, samples []float32, processors int, encoderBeginCallback func() bool, newSegmentCallback func(int)) error {
|
||||
registerEncoderBeginCallback(ctx, encoderBeginCallback)
|
||||
registerNewSegmentCallback(ctx, newSegmentCallback)
|
||||
defer registerEncoderBeginCallback(ctx, nil)
|
||||
defer registerNewSegmentCallback(ctx, nil)
|
||||
|
||||
if C.whisper_full_parallel((*C.struct_whisper_context)(ctx), (C.struct_whisper_full_params)(params), (*C.float)(&samples[0]), C.int(len(samples)), C.int(processors)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Return the id of the autodetected language, returns -1 if not found
|
||||
// Added to whisper.cpp in
|
||||
// https://github.com/ggerganov/whisper.cpp/commit/a1c1583cc7cd8b75222857afc936f0638c5683d6
|
||||
//
|
||||
// Examples:
|
||||
//
|
||||
// "de" -> 2
|
||||
// "german" -> 2
|
||||
func (ctx *Context) Whisper_full_lang_id() int {
|
||||
return int(C.whisper_full_lang_id((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Number of generated text segments.
|
||||
// A segment can be a few words, a sentence, or even a paragraph.
|
||||
func (ctx *Context) Whisper_full_n_segments() int {
|
||||
return int(C.whisper_full_n_segments((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Get the start and end time of the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_segment_t0(segment int) int64 {
|
||||
return int64(C.whisper_full_get_segment_t0((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get the start and end time of the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_segment_t1(segment int) int64 {
|
||||
return int64(C.whisper_full_get_segment_t1((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get the text of the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_segment_text(segment int) string {
|
||||
return C.GoString(C.whisper_full_get_segment_text((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get number of tokens in the specified segment.
|
||||
func (ctx *Context) Whisper_full_n_tokens(segment int) int {
|
||||
return int(C.whisper_full_n_tokens((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get the token text of the specified token index in the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_token_text(segment int, token int) string {
|
||||
return C.GoString(C.whisper_full_get_token_text((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
// Get the token of the specified token index in the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_token_id(segment int, token int) Token {
|
||||
return Token(C.whisper_full_get_token_id((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
// Get token data for the specified token in the specified segment.
|
||||
// This contains probabilities, timestamps, etc.
|
||||
func (ctx *Context) Whisper_full_get_token_data(segment int, token int) TokenData {
|
||||
return TokenData(C.whisper_full_get_token_data((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
// Get the probability of the specified token in the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_token_p(segment int, token int) float32 {
|
||||
return float32(C.whisper_full_get_token_p((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CALLBACKS
|
||||
|
||||
var (
|
||||
cbNewSegment = make(map[unsafe.Pointer]func(int))
|
||||
cbProgress = make(map[unsafe.Pointer]func(int))
|
||||
cbEncoderBegin = make(map[unsafe.Pointer]func() bool)
|
||||
)
|
||||
|
||||
func registerNewSegmentCallback(ctx *Context, fn func(int)) {
|
||||
if fn == nil {
|
||||
delete(cbNewSegment, unsafe.Pointer(ctx))
|
||||
} else {
|
||||
cbNewSegment[unsafe.Pointer(ctx)] = fn
|
||||
}
|
||||
}
|
||||
|
||||
func registerProgressCallback(ctx *Context, fn func(int)) {
|
||||
if fn == nil {
|
||||
delete(cbProgress, unsafe.Pointer(ctx))
|
||||
} else {
|
||||
cbProgress[unsafe.Pointer(ctx)] = fn
|
||||
}
|
||||
}
|
||||
|
||||
func registerEncoderBeginCallback(ctx *Context, fn func() bool) {
|
||||
if fn == nil {
|
||||
delete(cbEncoderBegin, unsafe.Pointer(ctx))
|
||||
} else {
|
||||
cbEncoderBegin[unsafe.Pointer(ctx)] = fn
|
||||
}
|
||||
}
|
||||
|
||||
//export callNewSegment
|
||||
func callNewSegment(user_data unsafe.Pointer, new C.int) {
|
||||
if fn, ok := cbNewSegment[user_data]; ok {
|
||||
fn(int(new))
|
||||
}
|
||||
}
|
||||
|
||||
//export callProgress
|
||||
func callProgress(user_data unsafe.Pointer, progress C.int) {
|
||||
if fn, ok := cbProgress[user_data]; ok {
|
||||
fn(int(progress))
|
||||
}
|
||||
}
|
||||
|
||||
//export callEncoderBegin
|
||||
func callEncoderBegin(user_data unsafe.Pointer) C.bool {
|
||||
if fn, ok := cbEncoderBegin[user_data]; ok {
|
||||
if fn() {
|
||||
return C.bool(true)
|
||||
} else {
|
||||
return C.bool(false)
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (t TokenData) T0() int64 {
|
||||
return int64(t.t0)
|
||||
}
|
||||
|
||||
func (t TokenData) T1() int64 {
|
||||
return int64(t.t1)
|
||||
}
|
||||
|
||||
func (t TokenData) Id() Token {
|
||||
return Token(t.id)
|
||||
}
|
@ -1,113 +0,0 @@
|
||||
package whisper_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
// Packages
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
wav "github.com/go-audio/wav"
|
||||
assert "github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
const (
|
||||
ModelPath = "models/ggml-small.en.bin"
|
||||
SamplePath = "samples/jfk.wav"
|
||||
)
|
||||
|
||||
func Test_Whisper_000(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
ctx := whisper.Whisper_init(ModelPath)
|
||||
assert.NotNil(ctx)
|
||||
ctx.Whisper_free()
|
||||
}
|
||||
|
||||
func Test_Whisper_001(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, sample not found:", SamplePath)
|
||||
}
|
||||
|
||||
// Open samples
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Read samples
|
||||
d := wav.NewDecoder(fh)
|
||||
buf, err := d.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
|
||||
// Run whisper
|
||||
ctx := whisper.Whisper_init(ModelPath)
|
||||
assert.NotNil(ctx)
|
||||
defer ctx.Whisper_free()
|
||||
params := ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
err = ctx.Whisper_full(params, data, nil, nil, nil)
|
||||
assert.NoError(err)
|
||||
|
||||
// Print out tokens
|
||||
num_segments := ctx.Whisper_full_n_segments()
|
||||
assert.GreaterOrEqual(num_segments, 1)
|
||||
for i := 0; i < num_segments; i++ {
|
||||
str := ctx.Whisper_full_get_segment_text(i)
|
||||
assert.NotEmpty(str)
|
||||
t0 := time.Duration(ctx.Whisper_full_get_segment_t0(i)) * time.Millisecond
|
||||
t1 := time.Duration(ctx.Whisper_full_get_segment_t1(i)) * time.Millisecond
|
||||
t.Logf("[%6s->%-6s] %q", t0, t1, str)
|
||||
}
|
||||
}
|
||||
|
||||
func Test_Whisper_002(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
for i := 0; i < whisper.Whisper_lang_max_id(); i++ {
|
||||
str := whisper.Whisper_lang_str(i)
|
||||
assert.NotEmpty(str)
|
||||
t.Log(str)
|
||||
}
|
||||
}
|
||||
|
||||
func Test_Whisper_003(t *testing.T) {
|
||||
threads := runtime.NumCPU()
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, sample not found:", SamplePath)
|
||||
}
|
||||
|
||||
// Open samples
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Read samples
|
||||
d := wav.NewDecoder(fh)
|
||||
buf, err := d.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
|
||||
// Make the model
|
||||
ctx := whisper.Whisper_init(ModelPath)
|
||||
assert.NotNil(ctx)
|
||||
defer ctx.Whisper_free()
|
||||
|
||||
// Get MEL
|
||||
assert.NoError(ctx.Whisper_pcm_to_mel(buf.AsFloat32Buffer().Data, threads))
|
||||
|
||||
// Get Languages
|
||||
languages, err := ctx.Whisper_lang_auto_detect(0, threads)
|
||||
assert.NoError(err)
|
||||
for i, p := range languages {
|
||||
t.Logf("%s: %f", whisper.Whisper_lang_str(i), p)
|
||||
}
|
||||
}
|
1
bindings/ios
Submodule
1
bindings/ios
Submodule
Submodule bindings/ios added at 4bda8e9d80
124
bindings/java/.idea/uiDesigner.xml
generated
124
bindings/java/.idea/uiDesigner.xml
generated
@ -1,124 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Palette2">
|
||||
<group name="Swing">
|
||||
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
|
||||
</item>
|
||||
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
|
||||
</item>
|
||||
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
|
||||
</item>
|
||||
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.svg" removable="false" auto-create-binding="false" can-attach-label="true">
|
||||
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
|
||||
</item>
|
||||
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
|
||||
<initial-values>
|
||||
<property name="text" value="Button" />
|
||||
</initial-values>
|
||||
</item>
|
||||
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
|
||||
<initial-values>
|
||||
<property name="text" value="RadioButton" />
|
||||
</initial-values>
|
||||
</item>
|
||||
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
|
||||
<initial-values>
|
||||
<property name="text" value="CheckBox" />
|
||||
</initial-values>
|
||||
</item>
|
||||
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
|
||||
<initial-values>
|
||||
<property name="text" value="Label" />
|
||||
</initial-values>
|
||||
</item>
|
||||
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
|
||||
<preferred-size width="150" height="-1" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
|
||||
<preferred-size width="150" height="-1" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
|
||||
<preferred-size width="150" height="-1" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
|
||||
<preferred-size width="150" height="50" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
|
||||
<preferred-size width="150" height="50" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
|
||||
<preferred-size width="150" height="50" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
|
||||
</item>
|
||||
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
|
||||
<preferred-size width="150" height="50" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
|
||||
<preferred-size width="150" height="50" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
|
||||
<preferred-size width="150" height="50" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
|
||||
<preferred-size width="200" height="200" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
|
||||
<preferred-size width="200" height="200" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.svg" removable="false" auto-create-binding="true" can-attach-label="true">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
|
||||
</item>
|
||||
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
|
||||
</item>
|
||||
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
|
||||
</item>
|
||||
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
|
||||
</item>
|
||||
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
|
||||
<preferred-size width="-1" height="20" />
|
||||
</default-constraints>
|
||||
</item>
|
||||
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
|
||||
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
|
||||
</item>
|
||||
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
|
||||
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
|
||||
</item>
|
||||
</group>
|
||||
</component>
|
||||
</project>
|
@ -1,71 +0,0 @@
|
||||
# Java JNI bindings for Whisper
|
||||
|
||||
This package provides Java JNI bindings for whisper.cpp. They have been tested on:
|
||||
|
||||
* <strike>Darwin (OS X) 12.6 on x64_64</strike>
|
||||
* Ubuntu on x86_64
|
||||
* Windows on x86_64
|
||||
|
||||
The "low level" bindings are in `WhisperCppJnaLibrary`. The most simple usage is as follows:
|
||||
|
||||
JNA will attempt to load the `whispercpp` shared library from:
|
||||
|
||||
- jna.library.path
|
||||
- jna.platform.library
|
||||
- ~/Library/Frameworks
|
||||
- /Library/Frameworks
|
||||
- /System/Library/Frameworks
|
||||
- classpath
|
||||
|
||||
```java
|
||||
import io.github.ggerganov.whispercpp.WhisperCpp;
|
||||
|
||||
public class Example {
|
||||
|
||||
public static void main(String[] args) {
|
||||
WhisperCpp whisper = new WhisperCpp();
|
||||
// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
|
||||
// or you can provide the absolute path to the model file.
|
||||
long context = whisper.initContext("base.en");
|
||||
try {
|
||||
var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
// custom configuration if required
|
||||
whisperParams.temperature_inc = 0f;
|
||||
|
||||
var samples = readAudio(); // divide each value by 32767.0f
|
||||
whisper.fullTranscribe(whisperParams, samples);
|
||||
|
||||
int segmentCount = whisper.getTextSegmentCount(context);
|
||||
for (int i = 0; i < segmentCount; i++) {
|
||||
String text = whisper.getTextSegment(context, i);
|
||||
System.out.println(segment.getText());
|
||||
}
|
||||
} finally {
|
||||
whisper.freeContext(context);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Building & Testing
|
||||
|
||||
In order to build, you need to have the JDK 8 or higher installed. Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/java
|
||||
|
||||
./gradlew build
|
||||
```
|
||||
|
||||
You need to have the `whisper` library in your [JNA library path](https://java-native-access.github.io/jna/4.2.1/com/sun/jna/NativeLibrary.html). On Windows the dll is included in the jar and you can update it:
|
||||
|
||||
```bash
|
||||
copy /y ..\..\build\bin\Release\whisper.dll build\generated\resources\main\win32-x86-64\whisper.dll
|
||||
```
|
||||
|
||||
|
||||
## License
|
||||
|
||||
The license for the Java bindings is the same as the license for the rest of the whisper.cpp project, which is the MIT License. See the `LICENSE` file for more details.
|
||||
|
@ -1,133 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id 'java-library'
|
||||
id 'maven-publish'
|
||||
id 'signing'
|
||||
}
|
||||
|
||||
archivesBaseName = 'whispercpp'
|
||||
group = 'io.github.ggerganov'
|
||||
version = '1.4.0'
|
||||
|
||||
|
||||
sourceCompatibility = 1.8
|
||||
targetCompatibility = 1.8
|
||||
|
||||
sourceSets {
|
||||
main {
|
||||
resources {
|
||||
srcDirs = ['src/main/resources', 'build/generated/resources/main']
|
||||
}
|
||||
}
|
||||
test {
|
||||
runtimeClasspath += files('build/generated/resources/main')
|
||||
}
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperDynlib', Copy) {
|
||||
from '../../build'
|
||||
include 'libwhisper.dynlib'
|
||||
into 'build/generated/resources/main/darwin'
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperSo', Copy) {
|
||||
from '../../build'
|
||||
include 'libwhisper.so'
|
||||
into 'build/generated/resources/main/linux-x86-64'
|
||||
}
|
||||
|
||||
tasks.register('copyWhisperDll', Copy) {
|
||||
from '../../build/Release'
|
||||
include 'whisper.dll'
|
||||
into 'build/generated/resources/main/windows-x86-64'
|
||||
}
|
||||
|
||||
tasks.register('copyLibs') {
|
||||
dependsOn copyLibwhisperDynlib, copyLibwhisperSo, copyWhisperDll
|
||||
}
|
||||
|
||||
test {
|
||||
systemProperty 'jna.library.path', project.file('build/generated/resources/main').absolutePath
|
||||
}
|
||||
|
||||
java {
|
||||
withSourcesJar()
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
jar {
|
||||
exclude '**/whisper_java.exp', '**/whisper_java.lib'
|
||||
}
|
||||
|
||||
javadoc {
|
||||
options.addStringOption('Xdoclint:none', '-quiet')
|
||||
}
|
||||
|
||||
tasks.withType(Test) {
|
||||
useJUnitPlatform()
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation "net.java.dev.jna:jna:5.13.0"
|
||||
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
|
||||
testImplementation "org.assertj:assertj-core:3.24.2"
|
||||
}
|
||||
|
||||
repositories {
|
||||
mavenCentral()
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
mavenJava(MavenPublication) {
|
||||
artifactId = 'whispercpp'
|
||||
from components.java
|
||||
pom {
|
||||
name = 'whispercpp'
|
||||
description = "Java JNA bindings for OpenAI's Whisper model, implemented in C/C++"
|
||||
url = 'https://github.com/ggerganov/whisper.cpp'
|
||||
licenses {
|
||||
license {
|
||||
name = 'MIT licence'
|
||||
url = 'https://raw.githubusercontent.com/ggerganov/whisper.cpp/master/LICENSE'
|
||||
}
|
||||
}
|
||||
developers {
|
||||
developer {
|
||||
id = 'ggerganov'
|
||||
name = 'Georgi Gerganov'
|
||||
email = 'ggerganov@gmail.com'
|
||||
}
|
||||
developer {
|
||||
id = 'nalbion'
|
||||
name = 'Nicholas Albion'
|
||||
email = 'nalbion@yahoo.com'
|
||||
}
|
||||
}
|
||||
scm {
|
||||
connection = 'scm:git:git://github.com/ggerganov/whisper.cpp.git'
|
||||
url = 'https://github.com/ggerganov/whisper.cpp'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
repositories {
|
||||
maven {
|
||||
def releasesRepoUrl = 'https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/'
|
||||
def snapshotsRepoUrl = 'https://s01.oss.sonatype.org/content/repositories/snapshots/'
|
||||
url = version.endsWith('-SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
|
||||
credentials {
|
||||
username = System.getenv("MAVEN_USERNAME")
|
||||
password = System.getenv("MAVEN_PASSWORD")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
signing {
|
||||
def signingKey = System.getenv("PGP_SECRET")
|
||||
def signingPassword = System.getenv("PGP_PASSPHRASE")
|
||||
useInMemoryPgpKeys(signingKey, signingPassword)
|
||||
sign publishing.publications.mavenJava
|
||||
}
|
@ -1,6 +0,0 @@
|
||||
org.gradle.jvmargs=-Xms256m -Xmx1024m
|
||||
system.include.dir=/usr/include
|
||||
#system.local.include.dir=../../include
|
||||
system.local.include.dir=./build/generated/sources/headers/java/main
|
||||
jni.include.dir=/usr/lib/jvm/java-8-openjdk-amd64/include/
|
||||
jni.lib.dir=/usr/lib/jvm/java-8-openjdk-amd64/lib/
|
BIN
bindings/java/gradle/wrapper/gradle-wrapper.jar
vendored
BIN
bindings/java/gradle/wrapper/gradle-wrapper.jar
vendored
Binary file not shown.
@ -1,6 +0,0 @@
|
||||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.1-bin.zip
|
||||
networkTimeout=10000
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
244
bindings/java/gradlew
vendored
244
bindings/java/gradlew
vendored
@ -1,244 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
#
|
||||
# Copyright © 2015-2021 the original authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
#
|
||||
# Gradle start up script for POSIX generated by Gradle.
|
||||
#
|
||||
# Important for running:
|
||||
#
|
||||
# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
|
||||
# noncompliant, but you have some other compliant shell such as ksh or
|
||||
# bash, then to run this script, type that shell name before the whole
|
||||
# command line, like:
|
||||
#
|
||||
# ksh Gradle
|
||||
#
|
||||
# Busybox and similar reduced shells will NOT work, because this script
|
||||
# requires all of these POSIX shell features:
|
||||
# * functions;
|
||||
# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
|
||||
# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
|
||||
# * compound commands having a testable exit status, especially «case»;
|
||||
# * various built-in commands including «command», «set», and «ulimit».
|
||||
#
|
||||
# Important for patching:
|
||||
#
|
||||
# (2) This script targets any POSIX shell, so it avoids extensions provided
|
||||
# by Bash, Ksh, etc; in particular arrays are avoided.
|
||||
#
|
||||
# The "traditional" practice of packing multiple parameters into a
|
||||
# space-separated string is a well documented source of bugs and security
|
||||
# problems, so this is (mostly) avoided, by progressively accumulating
|
||||
# options in "$@", and eventually passing that to Java.
|
||||
#
|
||||
# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
|
||||
# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
|
||||
# see the in-line comments for details.
|
||||
#
|
||||
# There are tweaks for specific operating systems such as AIX, CygWin,
|
||||
# Darwin, MinGW, and NonStop.
|
||||
#
|
||||
# (3) This script is generated from the Groovy template
|
||||
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
|
||||
# within the Gradle project.
|
||||
#
|
||||
# You can find Gradle at https://github.com/gradle/gradle/.
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
# Attempt to set APP_HOME
|
||||
|
||||
# Resolve links: $0 may be a link
|
||||
app_path=$0
|
||||
|
||||
# Need this for daisy-chained symlinks.
|
||||
while
|
||||
APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
|
||||
[ -h "$app_path" ]
|
||||
do
|
||||
ls=$( ls -ld "$app_path" )
|
||||
link=${ls#*' -> '}
|
||||
case $link in #(
|
||||
/*) app_path=$link ;; #(
|
||||
*) app_path=$APP_HOME$link ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# This is normally unused
|
||||
# shellcheck disable=SC2034
|
||||
APP_BASE_NAME=${0##*/}
|
||||
APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
|
||||
|
||||
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
|
||||
|
||||
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
||||
MAX_FD=maximum
|
||||
|
||||
warn () {
|
||||
echo "$*"
|
||||
} >&2
|
||||
|
||||
die () {
|
||||
echo
|
||||
echo "$*"
|
||||
echo
|
||||
exit 1
|
||||
} >&2
|
||||
|
||||
# OS specific support (must be 'true' or 'false').
|
||||
cygwin=false
|
||||
msys=false
|
||||
darwin=false
|
||||
nonstop=false
|
||||
case "$( uname )" in #(
|
||||
CYGWIN* ) cygwin=true ;; #(
|
||||
Darwin* ) darwin=true ;; #(
|
||||
MSYS* | MINGW* ) msys=true ;; #(
|
||||
NONSTOP* ) nonstop=true ;;
|
||||
esac
|
||||
|
||||
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
||||
|
||||
|
||||
# Determine the Java command to use to start the JVM.
|
||||
if [ -n "$JAVA_HOME" ] ; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD=$JAVA_HOME/jre/sh/java
|
||||
else
|
||||
JAVACMD=$JAVA_HOME/bin/java
|
||||
fi
|
||||
if [ ! -x "$JAVACMD" ] ; then
|
||||
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
else
|
||||
JAVACMD=java
|
||||
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
|
||||
# Increase the maximum file descriptors if we can.
|
||||
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
|
||||
case $MAX_FD in #(
|
||||
max*)
|
||||
# In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
|
||||
# shellcheck disable=SC3045
|
||||
MAX_FD=$( ulimit -H -n ) ||
|
||||
warn "Could not query maximum file descriptor limit"
|
||||
esac
|
||||
case $MAX_FD in #(
|
||||
'' | soft) :;; #(
|
||||
*)
|
||||
# In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
|
||||
# shellcheck disable=SC3045
|
||||
ulimit -n "$MAX_FD" ||
|
||||
warn "Could not set maximum file descriptor limit to $MAX_FD"
|
||||
esac
|
||||
fi
|
||||
|
||||
# Collect all arguments for the java command, stacking in reverse order:
|
||||
# * args from the command line
|
||||
# * the main class name
|
||||
# * -classpath
|
||||
# * -D...appname settings
|
||||
# * --module-path (only if needed)
|
||||
# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
|
||||
|
||||
# For Cygwin or MSYS, switch paths to Windows format before running java
|
||||
if "$cygwin" || "$msys" ; then
|
||||
APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
|
||||
CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
|
||||
|
||||
JAVACMD=$( cygpath --unix "$JAVACMD" )
|
||||
|
||||
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
||||
for arg do
|
||||
if
|
||||
case $arg in #(
|
||||
-*) false ;; # don't mess with options #(
|
||||
/?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
|
||||
[ -e "$t" ] ;; #(
|
||||
*) false ;;
|
||||
esac
|
||||
then
|
||||
arg=$( cygpath --path --ignore --mixed "$arg" )
|
||||
fi
|
||||
# Roll the args list around exactly as many times as the number of
|
||||
# args, so each arg winds up back in the position where it started, but
|
||||
# possibly modified.
|
||||
#
|
||||
# NB: a `for` loop captures its iteration list before it begins, so
|
||||
# changing the positional parameters here affects neither the number of
|
||||
# iterations, nor the values presented in `arg`.
|
||||
shift # remove old arg
|
||||
set -- "$@" "$arg" # push replacement arg
|
||||
done
|
||||
fi
|
||||
|
||||
# Collect all arguments for the java command;
|
||||
# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
|
||||
# shell script including quotes and variable substitutions, so put them in
|
||||
# double quotes to make sure that they get re-expanded; and
|
||||
# * put everything else in single quotes, so that it's not re-expanded.
|
||||
|
||||
set -- \
|
||||
"-Dorg.gradle.appname=$APP_BASE_NAME" \
|
||||
-classpath "$CLASSPATH" \
|
||||
org.gradle.wrapper.GradleWrapperMain \
|
||||
"$@"
|
||||
|
||||
# Stop when "xargs" is not available.
|
||||
if ! command -v xargs >/dev/null 2>&1
|
||||
then
|
||||
die "xargs is not available"
|
||||
fi
|
||||
|
||||
# Use "xargs" to parse quoted args.
|
||||
#
|
||||
# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
|
||||
#
|
||||
# In Bash we could simply go:
|
||||
#
|
||||
# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
|
||||
# set -- "${ARGS[@]}" "$@"
|
||||
#
|
||||
# but POSIX shell has neither arrays nor command substitution, so instead we
|
||||
# post-process each arg (as a line of input to sed) to backslash-escape any
|
||||
# character that might be a shell metacharacter, then use eval to reverse
|
||||
# that process (while maintaining the separation between arguments), and wrap
|
||||
# the whole thing up as a single "set" statement.
|
||||
#
|
||||
# This will of course break if any of these variables contains a newline or
|
||||
# an unmatched quote.
|
||||
#
|
||||
|
||||
eval "set -- $(
|
||||
printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
|
||||
xargs -n1 |
|
||||
sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
|
||||
tr '\n' ' '
|
||||
)" '"$@"'
|
||||
|
||||
exec "$JAVACMD" "$@"
|
92
bindings/java/gradlew.bat
vendored
92
bindings/java/gradlew.bat
vendored
@ -1,92 +0,0 @@
|
||||
@rem
|
||||
@rem Copyright 2015 the original author or authors.
|
||||
@rem
|
||||
@rem Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@rem you may not use this file except in compliance with the License.
|
||||
@rem You may obtain a copy of the License at
|
||||
@rem
|
||||
@rem https://www.apache.org/licenses/LICENSE-2.0
|
||||
@rem
|
||||
@rem Unless required by applicable law or agreed to in writing, software
|
||||
@rem distributed under the License is distributed on an "AS IS" BASIS,
|
||||
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
@rem See the License for the specific language governing permissions and
|
||||
@rem limitations under the License.
|
||||
@rem
|
||||
|
||||
@if "%DEBUG%"=="" @echo off
|
||||
@rem ##########################################################################
|
||||
@rem
|
||||
@rem Gradle startup script for Windows
|
||||
@rem
|
||||
@rem ##########################################################################
|
||||
|
||||
@rem Set local scope for the variables with windows NT shell
|
||||
if "%OS%"=="Windows_NT" setlocal
|
||||
|
||||
set DIRNAME=%~dp0
|
||||
if "%DIRNAME%"=="" set DIRNAME=.
|
||||
@rem This is normally unused
|
||||
set APP_BASE_NAME=%~n0
|
||||
set APP_HOME=%DIRNAME%
|
||||
|
||||
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
|
||||
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
|
||||
|
||||
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
|
||||
|
||||
@rem Find java.exe
|
||||
if defined JAVA_HOME goto findJavaFromJavaHome
|
||||
|
||||
set JAVA_EXE=java.exe
|
||||
%JAVA_EXE% -version >NUL 2>&1
|
||||
if %ERRORLEVEL% equ 0 goto execute
|
||||
|
||||
echo.
|
||||
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||
echo.
|
||||
echo Please set the JAVA_HOME variable in your environment to match the
|
||||
echo location of your Java installation.
|
||||
|
||||
goto fail
|
||||
|
||||
:findJavaFromJavaHome
|
||||
set JAVA_HOME=%JAVA_HOME:"=%
|
||||
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
||||
|
||||
if exist "%JAVA_EXE%" goto execute
|
||||
|
||||
echo.
|
||||
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
||||
echo.
|
||||
echo Please set the JAVA_HOME variable in your environment to match the
|
||||
echo location of your Java installation.
|
||||
|
||||
goto fail
|
||||
|
||||
:execute
|
||||
@rem Setup the command line
|
||||
|
||||
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
||||
|
||||
|
||||
@rem Execute Gradle
|
||||
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
|
||||
|
||||
:end
|
||||
@rem End local scope for the variables with windows NT shell
|
||||
if %ERRORLEVEL% equ 0 goto mainEnd
|
||||
|
||||
:fail
|
||||
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
||||
rem the _cmd.exe /c_ return code!
|
||||
set EXIT_CODE=%ERRORLEVEL%
|
||||
if %EXIT_CODE% equ 0 set EXIT_CODE=1
|
||||
if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
|
||||
exit /b %EXIT_CODE%
|
||||
|
||||
:mainEnd
|
||||
if "%OS%"=="Windows_NT" endlocal
|
||||
|
||||
:omega
|
@ -1 +0,0 @@
|
||||
rootProject.name = "whispercpp"
|
@ -1,41 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import com.sun.jna.Structure;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
import io.github.ggerganov.whispercpp.ggml.GgmlType;
|
||||
import io.github.ggerganov.whispercpp.WhisperModel;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperContext extends Structure {
|
||||
int t_load_us = 0;
|
||||
int t_start_us = 0;
|
||||
|
||||
/** weight type (FP32 / FP16 / QX) */
|
||||
GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
||||
/** intermediate type (FP32 or FP16) */
|
||||
GgmlType itype = GgmlType.GGML_TYPE_F16;
|
||||
|
||||
// WhisperModel model;
|
||||
public PointerByReference model;
|
||||
// whisper_vocab vocab;
|
||||
// whisper_state * state = nullptr;
|
||||
public PointerByReference vocab;
|
||||
public PointerByReference state;
|
||||
|
||||
/** populated by whisper_init_from_file_with_params() */
|
||||
String path_model;
|
||||
WhisperContextParams params;
|
||||
|
||||
// public static class ByReference extends WhisperContext implements Structure.ByReference {
|
||||
// }
|
||||
//
|
||||
// public static class ByValue extends WhisperContext implements Structure.ByValue {
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// protected List<String> getFieldOrder() {
|
||||
// return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
|
||||
// }
|
||||
}
|
@ -1,207 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import com.sun.jna.Native;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.bean.WhisperSegment;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Before calling most methods, you must call `initContext(modelPath)` to initialise the `ctx` Pointer.
|
||||
*/
|
||||
public class WhisperCpp implements AutoCloseable {
|
||||
private WhisperCppJnaLibrary lib = WhisperCppJnaLibrary.instance;
|
||||
private Pointer ctx = null;
|
||||
private Pointer paramsPointer = null;
|
||||
private Pointer greedyParamsPointer = null;
|
||||
private Pointer beamParamsPointer = null;
|
||||
|
||||
public File modelDir() {
|
||||
String modelDirPath = System.getenv("XDG_CACHE_HOME");
|
||||
if (modelDirPath == null) {
|
||||
modelDirPath = System.getProperty("user.home") + "/.cache";
|
||||
}
|
||||
|
||||
return new File(modelDirPath, "whisper");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
|
||||
*/
|
||||
public void initContext(String modelPath) throws FileNotFoundException {
|
||||
initContextImpl(modelPath, getContextDefaultParams());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
|
||||
* @param params - params to use when initialising the context
|
||||
*/
|
||||
public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
||||
initContextImpl(modelPath, params);
|
||||
}
|
||||
|
||||
private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
||||
if (ctx != null) {
|
||||
lib.whisper_free(ctx);
|
||||
}
|
||||
|
||||
if (!modelPath.contains("/") && !modelPath.contains("\\")) {
|
||||
if (!modelPath.endsWith(".bin")) {
|
||||
modelPath = "ggml-" + modelPath.replace("-", ".") + ".bin";
|
||||
}
|
||||
|
||||
modelPath = new File(modelDir(), modelPath).getAbsolutePath();
|
||||
}
|
||||
|
||||
ctx = lib.whisper_init_from_file_with_params(modelPath, params);
|
||||
|
||||
if (ctx == null) {
|
||||
throw new FileNotFoundException(modelPath);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
||||
* Because this function allocates memory for the params, the caller must call either:
|
||||
* - call `whisper_free_context_params()`
|
||||
* - `Native.free(Pointer.nativeValue(pointer));`
|
||||
*/
|
||||
public WhisperContextParams getContextDefaultParams() {
|
||||
paramsPointer = lib.whisper_context_default_params_by_ref();
|
||||
WhisperContextParams params = new WhisperContextParams(paramsPointer);
|
||||
params.read();
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides default params which can be used with `whisper_full()` etc.
|
||||
* Because this function allocates memory for the params, the caller must call either:
|
||||
* - call `whisper_free_params()`
|
||||
* - `Native.free(Pointer.nativeValue(pointer));`
|
||||
*
|
||||
* @param strategy - GREEDY
|
||||
*/
|
||||
public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
||||
Pointer pointer;
|
||||
|
||||
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
|
||||
if (strategy == WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY) {
|
||||
if (greedyParamsPointer == null) {
|
||||
greedyParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
|
||||
}
|
||||
pointer = greedyParamsPointer;
|
||||
} else {
|
||||
if (beamParamsPointer == null) {
|
||||
beamParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
|
||||
}
|
||||
pointer = beamParamsPointer;
|
||||
}
|
||||
|
||||
WhisperFullParams params = new WhisperFullParams(pointer);
|
||||
params.read();
|
||||
return params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
freeContext();
|
||||
freeParams();
|
||||
System.out.println("Whisper closed");
|
||||
}
|
||||
|
||||
private void freeContext() {
|
||||
if (ctx != null) {
|
||||
lib.whisper_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
private void freeParams() {
|
||||
if (paramsPointer != null) {
|
||||
Native.free(Pointer.nativeValue(paramsPointer));
|
||||
paramsPointer = null;
|
||||
}
|
||||
if (greedyParamsPointer != null) {
|
||||
Native.free(Pointer.nativeValue(greedyParamsPointer));
|
||||
greedyParamsPointer = null;
|
||||
}
|
||||
if (beamParamsPointer != null) {
|
||||
Native.free(Pointer.nativeValue(beamParamsPointer));
|
||||
beamParamsPointer = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
|
||||
* Not thread safe for same context
|
||||
* Uses the specified decoding strategy to obtain the text.
|
||||
*/
|
||||
public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
|
||||
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
||||
throw new IOException("Failed to process audio");
|
||||
}
|
||||
|
||||
int nSegments = lib.whisper_full_n_segments(ctx);
|
||||
|
||||
StringBuilder str = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < nSegments; i++) {
|
||||
String text = lib.whisper_full_get_segment_text(ctx, i);
|
||||
System.out.println("Segment:" + text);
|
||||
str.append(text);
|
||||
}
|
||||
|
||||
return str.toString().trim();
|
||||
}
|
||||
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
|
||||
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
||||
throw new IOException("Failed to process audio");
|
||||
}
|
||||
|
||||
int nSegments = lib.whisper_full_n_segments(ctx);
|
||||
List<WhisperSegment> segments= new ArrayList<>(nSegments);
|
||||
|
||||
|
||||
for (int i = 0; i < nSegments; i++) {
|
||||
long t0 = lib.whisper_full_get_segment_t0(ctx, i);
|
||||
String text = lib.whisper_full_get_segment_text(ctx, i);
|
||||
long t1 = lib.whisper_full_get_segment_t1(ctx, i);
|
||||
segments.add(new WhisperSegment(t0,t1,text));
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
// public int getTextSegmentCount(Pointer ctx) {
|
||||
// return lib.whisper_full_n_segments(ctx);
|
||||
// }
|
||||
// public String getTextSegment(Pointer ctx, int index) {
|
||||
// return lib.whisper_full_get_segment_text(ctx, index);
|
||||
// }
|
||||
|
||||
public String getSystemInfo() {
|
||||
return lib.whisper_print_system_info();
|
||||
}
|
||||
|
||||
public int benchMemcpy(int nthread) {
|
||||
return lib.whisper_bench_memcpy(nthread);
|
||||
}
|
||||
|
||||
public int benchGgmlMulMat(int nthread) {
|
||||
return lib.whisper_bench_ggml_mul_mat(nthread);
|
||||
}
|
||||
}
|
@ -1,388 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import com.sun.jna.Library;
|
||||
import com.sun.jna.Native;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.model.WhisperModelLoader;
|
||||
import io.github.ggerganov.whispercpp.model.WhisperTokenData;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
|
||||
|
||||
public interface WhisperCppJnaLibrary extends Library {
|
||||
WhisperCppJnaLibrary instance = Native.load("whisper", WhisperCppJnaLibrary.class);
|
||||
|
||||
String whisper_print_system_info();
|
||||
|
||||
/**
|
||||
* DEPRECATED. Allocate (almost) all memory needed for the model by loading from a file.
|
||||
*
|
||||
* @param path_model Path to the model file
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_file(String path_model);
|
||||
|
||||
/**
|
||||
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
||||
* Because this function allocates memory for the params, the caller must call either:
|
||||
* - call `whisper_free_context_params()`
|
||||
* - `Native.free(Pointer.nativeValue(pointer));`
|
||||
*/
|
||||
Pointer whisper_context_default_params_by_ref();
|
||||
|
||||
void whisper_free_context_params(Pointer params);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model by loading from a file.
|
||||
*
|
||||
* @param path_model Path to the model file
|
||||
* @param params Pointer to whisper_context_params
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model by loading from a buffer.
|
||||
*
|
||||
* @param buffer Model buffer
|
||||
* @param buffer_size Size of the model buffer
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_buffer(Pointer buffer, int buffer_size);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model using a model loader.
|
||||
*
|
||||
* @param loader Model loader
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init(WhisperModelLoader loader);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model by loading from a file without allocating the state.
|
||||
*
|
||||
* @param path_model Path to the model file
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_file_no_state(String path_model);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model by loading from a buffer without allocating the state.
|
||||
*
|
||||
* @param buffer Model buffer
|
||||
* @param buffer_size Size of the model buffer
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_buffer_no_state(Pointer buffer, int buffer_size);
|
||||
|
||||
// Pointer whisper_init_from_buffer_no_state(Pointer buffer, long buffer_size);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model using a model loader without allocating the state.
|
||||
*
|
||||
* @param loader Model loader
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_no_state(WhisperModelLoader loader);
|
||||
|
||||
/**
|
||||
* Allocate memory for the Whisper state.
|
||||
*
|
||||
* @param ctx Whisper context
|
||||
* @return Whisper state on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_state(Pointer ctx);
|
||||
|
||||
/**
|
||||
* Free all allocated memory associated with the Whisper context.
|
||||
*
|
||||
* @param ctx Whisper context
|
||||
*/
|
||||
void whisper_free(Pointer ctx);
|
||||
|
||||
/**
|
||||
* Free all allocated memory associated with the Whisper state.
|
||||
*
|
||||
* @param state Whisper state
|
||||
*/
|
||||
void whisper_free_state(Pointer state);
|
||||
|
||||
|
||||
/**
|
||||
* Convert RAW PCM audio to log mel spectrogram.
|
||||
* The resulting spectrogram is stored inside the default state of the provided whisper context.
|
||||
*
|
||||
* @param ctx - Pointer to a WhisperContext
|
||||
* @return 0 on success
|
||||
*/
|
||||
int whisper_pcm_to_mel(Pointer ctx, final float[] samples, int n_samples, int n_threads);
|
||||
|
||||
/**
|
||||
* @param ctx Pointer to a WhisperContext
|
||||
* @param state Pointer to WhisperState
|
||||
* @param n_samples
|
||||
* @param n_threads
|
||||
* @return 0 on success
|
||||
*/
|
||||
int whisper_pcm_to_mel_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
|
||||
|
||||
/**
|
||||
* This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
||||
* Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
||||
* n_mel must be 80
|
||||
* @return 0 on success
|
||||
*/
|
||||
int whisper_set_mel(Pointer ctx, final float[] data, int n_len, int n_mel);
|
||||
int whisper_set_mel_with_state(Pointer ctx, Pointer state, final float[] data, int n_len, int n_mel);
|
||||
|
||||
/**
|
||||
* Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
|
||||
* Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
* Offset can be used to specify the offset of the first frame in the spectrogram.
|
||||
* @return 0 on success
|
||||
*/
|
||||
int whisper_encode(Pointer ctx, int offset, int n_threads);
|
||||
|
||||
int whisper_encode_with_state(Pointer ctx, Pointer state, int offset, int n_threads);
|
||||
|
||||
/**
|
||||
* Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
||||
* Make sure to call whisper_encode() first.
|
||||
* tokens + n_tokens is the provided context for the decoder.
|
||||
* n_past is the number of tokens to use from previous decoder calls.
|
||||
* Returns 0 on success
|
||||
* TODO: add support for multiple decoders
|
||||
*/
|
||||
int whisper_decode(Pointer ctx, Pointer tokens, int n_tokens, int n_past, int n_threads);
|
||||
|
||||
/**
|
||||
* @param ctx
|
||||
* @param state
|
||||
* @param tokens Pointer to int tokens
|
||||
* @param n_tokens
|
||||
* @param n_past
|
||||
* @param n_threads
|
||||
* @return
|
||||
*/
|
||||
int whisper_decode_with_state(Pointer ctx, Pointer state, Pointer tokens, int n_tokens, int n_past, int n_threads);
|
||||
|
||||
/**
|
||||
* Convert the provided text into tokens.
|
||||
* The tokens pointer must be large enough to hold the resulting tokens.
|
||||
* Returns the number of tokens on success, no more than n_max_tokens
|
||||
* Returns -1 on failure
|
||||
* TODO: not sure if correct
|
||||
*/
|
||||
int whisper_tokenize(Pointer ctx, String text, Pointer tokens, int n_max_tokens);
|
||||
|
||||
/** Largest language id (i.e. number of available languages - 1) */
|
||||
int whisper_lang_max_id();
|
||||
|
||||
/**
|
||||
* @return the id of the specified language, returns -1 if not found.
|
||||
* Examples:
|
||||
* "de" -> 2
|
||||
* "german" -> 2
|
||||
*/
|
||||
int whisper_lang_id(String lang);
|
||||
|
||||
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
|
||||
String whisper_lang_str(int id);
|
||||
|
||||
/**
|
||||
* Use mel data at offset_ms to try and auto-detect the spoken language.
|
||||
* Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
|
||||
* Returns the top language id or negative on failure
|
||||
* If not null, fills the lang_probs array with the probabilities of all languages
|
||||
* The array must be whisper_lang_max_id() + 1 in size
|
||||
*
|
||||
* ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
||||
*/
|
||||
int whisper_lang_auto_detect(Pointer ctx, int offset_ms, int n_threads, float[] lang_probs);
|
||||
|
||||
int whisper_lang_auto_detect_with_state(Pointer ctx, Pointer state, int offset_ms, int n_threads, float[] lang_probs);
|
||||
|
||||
int whisper_n_len (Pointer ctx); // mel length
|
||||
int whisper_n_len_from_state(Pointer state); // mel length
|
||||
int whisper_n_vocab (Pointer ctx);
|
||||
int whisper_n_text_ctx (Pointer ctx);
|
||||
int whisper_n_audio_ctx (Pointer ctx);
|
||||
int whisper_is_multilingual (Pointer ctx);
|
||||
|
||||
int whisper_model_n_vocab (Pointer ctx);
|
||||
int whisper_model_n_audio_ctx (Pointer ctx);
|
||||
int whisper_model_n_audio_state(Pointer ctx);
|
||||
int whisper_model_n_audio_head (Pointer ctx);
|
||||
int whisper_model_n_audio_layer(Pointer ctx);
|
||||
int whisper_model_n_text_ctx (Pointer ctx);
|
||||
int whisper_model_n_text_state (Pointer ctx);
|
||||
int whisper_model_n_text_head (Pointer ctx);
|
||||
int whisper_model_n_text_layer (Pointer ctx);
|
||||
int whisper_model_n_mels (Pointer ctx);
|
||||
int whisper_model_ftype (Pointer ctx);
|
||||
int whisper_model_type (Pointer ctx);
|
||||
|
||||
/**
|
||||
* Token logits obtained from the last call to whisper_decode().
|
||||
* The logits for the last token are stored in the last row
|
||||
* Rows: n_tokens
|
||||
* Cols: n_vocab
|
||||
*/
|
||||
float[] whisper_get_logits (Pointer ctx);
|
||||
float[] whisper_get_logits_from_state(Pointer state);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
String whisper_token_to_str(Pointer ctx, int token);
|
||||
String whisper_model_type_readable(Pointer ctx);
|
||||
|
||||
// Special tokens
|
||||
int whisper_token_eot (Pointer ctx);
|
||||
int whisper_token_sot (Pointer ctx);
|
||||
int whisper_token_prev(Pointer ctx);
|
||||
int whisper_token_solm(Pointer ctx);
|
||||
int whisper_token_not (Pointer ctx);
|
||||
int whisper_token_beg (Pointer ctx);
|
||||
int whisper_token_lang(Pointer ctx, int lang_id);
|
||||
|
||||
// Task tokens
|
||||
int whisper_token_translate (Pointer ctx);
|
||||
int whisper_token_transcribe(Pointer ctx);
|
||||
|
||||
// Performance information from the default state.
|
||||
void whisper_print_timings(Pointer ctx);
|
||||
void whisper_reset_timings(Pointer ctx);
|
||||
|
||||
// Note: Even if `whisper_full_params is stripped back to just 4 ints, JNA throws "Invalid memory access"
|
||||
// when `whisper_full_default_params()` tries to return a struct.
|
||||
// WhisperFullParams whisper_full_default_params(int strategy);
|
||||
|
||||
/**
|
||||
* Provides default params which can be used with `whisper_full()` etc.
|
||||
* Because this function allocates memory for the params, the caller must call either:
|
||||
* - call `whisper_free_params()`
|
||||
* - `Native.free(Pointer.nativeValue(pointer));`
|
||||
*
|
||||
* @param strategy - WhisperSamplingStrategy.value
|
||||
*/
|
||||
Pointer whisper_full_default_params_by_ref(int strategy);
|
||||
|
||||
void whisper_free_params(Pointer params);
|
||||
|
||||
/**
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
* Not thread safe for same context
|
||||
* Uses the specified decoding strategy to obtain the text.
|
||||
*/
|
||||
int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
|
||||
int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
||||
// Result is stored in the default state of the context
|
||||
// Not thread safe if executed in parallel on the same context.
|
||||
// It seems this approach can offer some speedup in some cases.
|
||||
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
|
||||
|
||||
/**
|
||||
* Number of generated text segments.
|
||||
* A segment can be a few words, a sentence, or even a paragraph.
|
||||
* @param ctx Pointer to WhisperContext
|
||||
*/
|
||||
int whisper_full_n_segments (Pointer ctx);
|
||||
|
||||
/**
|
||||
* @param state Pointer to WhisperState
|
||||
*/
|
||||
int whisper_full_n_segments_from_state(Pointer state);
|
||||
|
||||
/**
|
||||
* Language id associated with the context's default state.
|
||||
* @param ctx Pointer to WhisperContext
|
||||
*/
|
||||
int whisper_full_lang_id(Pointer ctx);
|
||||
|
||||
/** Language id associated with the provided state */
|
||||
int whisper_full_lang_id_from_state(Pointer state);
|
||||
|
||||
|
||||
/** Get the start time of the specified segment. */
|
||||
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
|
||||
|
||||
/** Get the start time of the specified segment from the state. */
|
||||
long whisper_full_get_segment_t0_from_state(Pointer state, int i_segment);
|
||||
|
||||
/** Get the end time of the specified segment. */
|
||||
long whisper_full_get_segment_t1(Pointer ctx, int i_segment);
|
||||
|
||||
/** Get the end time of the specified segment from the state. */
|
||||
long whisper_full_get_segment_t1_from_state(Pointer state, int i_segment);
|
||||
|
||||
/** Get the text of the specified segment. */
|
||||
String whisper_full_get_segment_text(Pointer ctx, int i_segment);
|
||||
|
||||
/** Get the text of the specified segment from the state. */
|
||||
String whisper_full_get_segment_text_from_state(Pointer state, int i_segment);
|
||||
|
||||
/** Get the number of tokens in the specified segment. */
|
||||
int whisper_full_n_tokens(Pointer ctx, int i_segment);
|
||||
|
||||
/** Get the number of tokens in the specified segment from the state. */
|
||||
int whisper_full_n_tokens_from_state(Pointer state, int i_segment);
|
||||
|
||||
/** Get the token text of the specified token in the specified segment. */
|
||||
String whisper_full_get_token_text(Pointer ctx, int i_segment, int i_token);
|
||||
|
||||
|
||||
/** Get the token text of the specified token in the specified segment from the state. */
|
||||
String whisper_full_get_token_text_from_state(Pointer ctx, Pointer state, int i_segment, int i_token);
|
||||
|
||||
/** Get the token ID of the specified token in the specified segment. */
|
||||
int whisper_full_get_token_id(Pointer ctx, int i_segment, int i_token);
|
||||
|
||||
/** Get the token ID of the specified token in the specified segment from the state. */
|
||||
int whisper_full_get_token_id_from_state(Pointer state, int i_segment, int i_token);
|
||||
|
||||
/** Get token data for the specified token in the specified segment. */
|
||||
WhisperTokenData whisper_full_get_token_data(Pointer ctx, int i_segment, int i_token);
|
||||
|
||||
/** Get token data for the specified token in the specified segment from the state. */
|
||||
WhisperTokenData whisper_full_get_token_data_from_state(Pointer state, int i_segment, int i_token);
|
||||
|
||||
/** Get the probability of the specified token in the specified segment. */
|
||||
float whisper_full_get_token_p(Pointer ctx, int i_segment, int i_token);
|
||||
|
||||
/** Get the probability of the specified token in the specified segment from the state. */
|
||||
float whisper_full_get_token_p_from_state(Pointer state, int i_segment, int i_token);
|
||||
|
||||
/**
|
||||
* Benchmark function for memcpy.
|
||||
*
|
||||
* @param nThreads Number of threads to use for the benchmark.
|
||||
* @return The result of the benchmark.
|
||||
*/
|
||||
int whisper_bench_memcpy(int nThreads);
|
||||
|
||||
/**
|
||||
* Benchmark function for memcpy as a string.
|
||||
*
|
||||
* @param nThreads Number of threads to use for the benchmark.
|
||||
* @return The result of the benchmark as a string.
|
||||
*/
|
||||
String whisper_bench_memcpy_str(int nThreads);
|
||||
|
||||
/**
|
||||
* Benchmark function for ggml_mul_mat.
|
||||
*
|
||||
* @param nThreads Number of threads to use for the benchmark.
|
||||
* @return The result of the benchmark.
|
||||
*/
|
||||
int whisper_bench_ggml_mul_mat(int nThreads);
|
||||
|
||||
/**
|
||||
* Benchmark function for ggml_mul_mat as a string.
|
||||
*
|
||||
* @param nThreads Number of threads to use for the benchmark.
|
||||
* @return The result of the benchmark as a string.
|
||||
*/
|
||||
String whisper_bench_ggml_mul_mat_str(int nThreads);
|
||||
}
|
@ -1,47 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.bean;
|
||||
|
||||
/**
|
||||
* Created by litonglinux@qq.com on 10/21/2023_7:48 AM
|
||||
*/
|
||||
public class WhisperSegment {
|
||||
private long start, end;
|
||||
private String sentence;
|
||||
|
||||
public WhisperSegment() {
|
||||
}
|
||||
|
||||
public WhisperSegment(long start, long end, String sentence) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.sentence = sentence;
|
||||
}
|
||||
|
||||
public long getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
public long getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
public String getSentence() {
|
||||
return sentence;
|
||||
}
|
||||
|
||||
public void setStart(long start) {
|
||||
this.start = start;
|
||||
}
|
||||
|
||||
public void setEnd(long end) {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public void setSentence(String sentence) {
|
||||
this.sentence = sentence;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "[" + start + " --> " + end + "]:" + sentence;
|
||||
}
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.callbacks;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.WhisperContext;
|
||||
import io.github.ggerganov.whispercpp.model.WhisperState;
|
||||
|
||||
/**
|
||||
* Callback before the encoder starts.
|
||||
* If not null, called before the encoder starts.
|
||||
* If it returns false, the computation is aborted.
|
||||
*/
|
||||
public interface WhisperEncoderBeginCallback extends Callback {
|
||||
|
||||
/**
|
||||
* Callback method before the encoder starts.
|
||||
*
|
||||
* @param ctx The whisper context.
|
||||
* @param state The whisper state.
|
||||
* @param user_data User data.
|
||||
* @return True if the computation should proceed, false otherwise.
|
||||
*/
|
||||
boolean callback(Pointer ctx, Pointer state, Pointer user_data);
|
||||
}
|
@ -1,25 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.callbacks;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.model.WhisperTokenData;
|
||||
|
||||
/**
|
||||
* Callback to filter logits.
|
||||
* Can be used to modify the logits before sampling.
|
||||
* If not null, called after applying temperature to logits.
|
||||
*/
|
||||
public interface WhisperLogitsFilterCallback extends Callback {
|
||||
|
||||
/**
|
||||
* Callback method to filter logits.
|
||||
*
|
||||
* @param ctx The whisper context.
|
||||
* @param state The whisper state.
|
||||
* @param tokens The array of whisper_token_data.
|
||||
* @param n_tokens The number of tokens.
|
||||
* @param logits The array of logits.
|
||||
* @param user_data User data.
|
||||
*/
|
||||
void callback(Pointer ctx, Pointer state, WhisperTokenData[] tokens, int n_tokens, float[] logits, Pointer user_data);
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.callbacks;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.WhisperContext;
|
||||
import io.github.ggerganov.whispercpp.model.WhisperState;
|
||||
|
||||
/**
|
||||
* Callback for the text segment.
|
||||
* Called on every newly generated text segment.
|
||||
* Use the whisper_full_...() functions to obtain the text segments.
|
||||
*/
|
||||
public interface WhisperNewSegmentCallback extends Callback {
|
||||
|
||||
/**
|
||||
* Callback method for the text segment.
|
||||
*
|
||||
* @param ctx The whisper context.
|
||||
* @param state The whisper state.
|
||||
* @param n_new The number of newly generated text segments.
|
||||
* @param user_data User data.
|
||||
*/
|
||||
void callback(Pointer ctx, Pointer state, int n_new, Pointer user_data);
|
||||
}
|
@ -1,22 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.callbacks;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.WhisperContext;
|
||||
import io.github.ggerganov.whispercpp.model.WhisperState;
|
||||
|
||||
/**
|
||||
* Callback for progress updates.
|
||||
*/
|
||||
public interface WhisperProgressCallback extends Callback {
|
||||
|
||||
/**
|
||||
* Callback method for progress updates.
|
||||
*
|
||||
* @param ctx The whisper context.
|
||||
* @param state The whisper state.
|
||||
* @param progress The progress value.
|
||||
* @param user_data User data.
|
||||
*/
|
||||
void callback(Pointer ctx, Pointer state, int progress, Pointer user_data);
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.ggml;
|
||||
|
||||
public class GgmlTensor {
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.ggml;
|
||||
|
||||
public enum GgmlType {
|
||||
GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1,
|
||||
REMOVED_GGML_TYPE_Q4_2, // support has been removed
|
||||
REMOVED_GGML_TYPE_Q4_3, // support has been removed
|
||||
GGML_TYPE_Q5_0,
|
||||
GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q8_1,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
GGML_TYPE_COUNT,
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.model;
|
||||
|
||||
public enum EModel {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_TINY,
|
||||
MODEL_BASE,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
}
|
@ -1,49 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import io.github.ggerganov.whispercpp.ggml.GgmlTensor;
|
||||
import io.github.ggerganov.whispercpp.model.EModel;
|
||||
|
||||
public class WhisperModel {
|
||||
// EModel type = EModel.MODEL_UNKNOWN;
|
||||
//
|
||||
// WhisperHParams hparams;
|
||||
// WhisperFilters filters;
|
||||
//
|
||||
// // encoder.positional_embedding
|
||||
// GgmlTensor e_pe;
|
||||
//
|
||||
// // encoder.conv1
|
||||
// GgmlTensor e_conv_1_w;
|
||||
// GgmlTensor e_conv_1_b;
|
||||
//
|
||||
// // encoder.conv2
|
||||
// GgmlTensor e_conv_2_w;
|
||||
// GgmlTensor e_conv_2_b;
|
||||
//
|
||||
// // encoder.ln_post
|
||||
// GgmlTensor e_ln_w;
|
||||
// GgmlTensor e_ln_b;
|
||||
//
|
||||
// // decoder.positional_embedding
|
||||
// GgmlTensor d_pe;
|
||||
//
|
||||
// // decoder.token_embedding
|
||||
// GgmlTensor d_te;
|
||||
//
|
||||
// // decoder.ln
|
||||
// GgmlTensor d_ln_w;
|
||||
// GgmlTensor d_ln_b;
|
||||
//
|
||||
// std::vector<whisper_layer_encoder> layers_encoder;
|
||||
// std::vector<whisper_layer_decoder> layers_decoder;
|
||||
//
|
||||
// // context
|
||||
// struct ggml_context * ctx;
|
||||
//
|
||||
// // the model memory buffer is read-only and can be shared between processors
|
||||
// std::vector<uint8_t> * buf;
|
||||
//
|
||||
// // tensors
|
||||
// int n_loaded;
|
||||
// Map<String, GgmlTensor> tensors;
|
||||
}
|
@ -1,62 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.model;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
import com.sun.jna.Pointer;
|
||||
import com.sun.jna.Structure;
|
||||
|
||||
|
||||
public class WhisperModelLoader extends Structure {
|
||||
public Pointer context;
|
||||
public ReadFunction read;
|
||||
public EOFFunction eof;
|
||||
public CloseFunction close;
|
||||
|
||||
public static class ReadFunction implements Callback {
|
||||
public Pointer invoke(Pointer ctx, Pointer output, int readSize) {
|
||||
// TODO
|
||||
return ctx;
|
||||
}
|
||||
}
|
||||
|
||||
public static class EOFFunction implements Callback {
|
||||
public boolean invoke(Pointer ctx) {
|
||||
// TODO
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static class CloseFunction implements Callback {
|
||||
public void invoke(Pointer ctx) {
|
||||
// TODO
|
||||
}
|
||||
}
|
||||
|
||||
// public WhisperModelLoader(Pointer p) {
|
||||
// super(p);
|
||||
// read = new ReadFunction();
|
||||
// eof = new EOFFunction();
|
||||
// close = new CloseFunction();
|
||||
// read.setCallback(this);
|
||||
// eof.setCallback(this);
|
||||
// close.setCallback(this);
|
||||
// read.write();
|
||||
// eof.write();
|
||||
// close.write();
|
||||
// }
|
||||
|
||||
public WhisperModelLoader() {
|
||||
super();
|
||||
}
|
||||
|
||||
public interface ReadCallback extends Callback {
|
||||
Pointer invoke(Pointer ctx, Pointer output, int readSize);
|
||||
}
|
||||
|
||||
public interface EOFCallback extends Callback {
|
||||
boolean invoke(Pointer ctx);
|
||||
}
|
||||
|
||||
public interface CloseCallback extends Callback {
|
||||
void invoke(Pointer ctx);
|
||||
}
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.model;
|
||||
|
||||
public class WhisperState {
|
||||
}
|
@ -1,50 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.model;
|
||||
|
||||
import com.sun.jna.Structure;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Structure representing token data.
|
||||
*/
|
||||
public class WhisperTokenData extends Structure {
|
||||
|
||||
/** Token ID. */
|
||||
public int id;
|
||||
|
||||
/** Forced timestamp token ID. */
|
||||
public int tid;
|
||||
|
||||
/** Probability of the token. */
|
||||
public float p;
|
||||
|
||||
/** Log probability of the token. */
|
||||
public float plog;
|
||||
|
||||
/** Probability of the timestamp token. */
|
||||
public float pt;
|
||||
|
||||
/** Sum of probabilities of all timestamp tokens. */
|
||||
public float ptsum;
|
||||
|
||||
/**
|
||||
* Start time of the token (token-level timestamp data).
|
||||
* Do not use if you haven't computed token-level timestamps.
|
||||
*/
|
||||
public long t0;
|
||||
|
||||
/**
|
||||
* End time of the token (token-level timestamp data).
|
||||
* Do not use if you haven't computed token-level timestamps.
|
||||
*/
|
||||
public long t1;
|
||||
|
||||
/** Voice length of the token. */
|
||||
public float vlen;
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("id", "tid", "p", "plog", "pt", "ptsum", "t0", "t1", "vlen");
|
||||
}
|
||||
}
|
@ -1,19 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.Structure;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class BeamSearchParams extends Structure {
|
||||
/** ref: <a href="https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265">...</a> */
|
||||
public int beam_size;
|
||||
|
||||
/** ref: <a href="https://arxiv.org/pdf/2204.05424.pdf">...</a> */
|
||||
public float patience;
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("beam_size", "patience");
|
||||
}
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.IntegerType;
|
||||
|
||||
import java.util.function.BooleanSupplier;
|
||||
|
||||
public class CBool extends IntegerType implements BooleanSupplier {
|
||||
public static final int SIZE = 1;
|
||||
public static final CBool FALSE = new CBool(0);
|
||||
public static final CBool TRUE = new CBool(1);
|
||||
|
||||
|
||||
public CBool() {
|
||||
this(0);
|
||||
}
|
||||
|
||||
public CBool(long value) {
|
||||
super(SIZE, value, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getAsBoolean() {
|
||||
return intValue() == 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return intValue() == 1 ? "true" : "false";
|
||||
}
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.Structure;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class GreedyParams extends Structure {
|
||||
/** <a href="https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264">...</a> */
|
||||
public int best_of;
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Collections.singletonList("best_of");
|
||||
}
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.*;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Parameters for the whisper_init_from_file_with_params() function.
|
||||
* If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
||||
* whisper_context_default_params()
|
||||
*/
|
||||
public class WhisperContextParams extends Structure {
|
||||
|
||||
public WhisperContextParams(Pointer p) {
|
||||
super(p);
|
||||
}
|
||||
|
||||
/** Use GPU for inference Number (default = true) */
|
||||
public CBool use_gpu;
|
||||
|
||||
/** Use GPU for inference Number (default = true) */
|
||||
public void useGpu(boolean enable) {
|
||||
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("use_gpu");
|
||||
}
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperFilters {
|
||||
int n_mel;
|
||||
int n_fft;
|
||||
|
||||
List<Float> data;
|
||||
}
|
@ -1,326 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.*;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Parameters for the whisper_full() function.
|
||||
* If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
||||
* whisper_full_default_params()
|
||||
*/
|
||||
public class WhisperFullParams extends Structure {
|
||||
|
||||
public WhisperFullParams(Pointer p) {
|
||||
super(p);
|
||||
// super(p, ALIGN_MSVC);
|
||||
// super(p, ALIGN_GNUC);
|
||||
}
|
||||
|
||||
/** Sampling strategy for whisper_full() function. */
|
||||
public int strategy;
|
||||
|
||||
/** Number of threads. (default = 4) */
|
||||
public int n_threads;
|
||||
|
||||
/** Maximum tokens to use from past text as a prompt for the decoder. (default = 16384) */
|
||||
public int n_max_text_ctx;
|
||||
|
||||
/** Start offset in milliseconds. (default = 0) */
|
||||
public int offset_ms;
|
||||
|
||||
/** Audio duration to process in milliseconds. (default = 0) */
|
||||
public int duration_ms;
|
||||
|
||||
/** Translate flag. (default = false) */
|
||||
public CBool translate;
|
||||
|
||||
/** The compliment of translateMode() */
|
||||
public void transcribeMode() {
|
||||
translate = CBool.FALSE;
|
||||
}
|
||||
|
||||
/** The compliment of transcribeMode() */
|
||||
public void translateMode() {
|
||||
translate = CBool.TRUE;
|
||||
}
|
||||
|
||||
/** Flag to indicate whether to use past transcription (if any) as an initial prompt for the decoder. (default = true) */
|
||||
public CBool no_context;
|
||||
|
||||
/** Flag to indicate whether to use past transcription (if any) as an initial prompt for the decoder. (default = true) */
|
||||
public void enableContext(boolean enable) {
|
||||
no_context = enable ? CBool.FALSE : CBool.TRUE;
|
||||
}
|
||||
|
||||
/** Generate timestamps or not? */
|
||||
public CBool no_timestamps;
|
||||
|
||||
/** Flag to force single segment output (useful for streaming). (default = false) */
|
||||
public CBool single_segment;
|
||||
|
||||
/** Flag to force single segment output (useful for streaming). (default = false) */
|
||||
public void singleSegment(boolean single) {
|
||||
single_segment = single ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
public CBool print_special;
|
||||
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
public void printSpecial(boolean enable) {
|
||||
print_special = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to print progress information. (default = true) */
|
||||
public CBool print_progress;
|
||||
|
||||
/** Flag to print progress information. (default = true) */
|
||||
public void printProgress(boolean enable) {
|
||||
print_progress = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to print results from within whisper.cpp (avoid it, use callback instead). (default = true) */
|
||||
public CBool print_realtime;
|
||||
|
||||
/** Flag to print results from within whisper.cpp (avoid it, use callback instead). (default = true) */
|
||||
public void printRealtime(boolean enable) {
|
||||
print_realtime = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to print timestamps for each text segment when printing realtime. (default = true) */
|
||||
public CBool print_timestamps;
|
||||
|
||||
/** Flag to print timestamps for each text segment when printing realtime. (default = true) */
|
||||
public void printTimestamps(boolean enable) {
|
||||
print_timestamps = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** [EXPERIMENTAL] Flag to enable token-level timestamps. (default = false) */
|
||||
public CBool token_timestamps;
|
||||
|
||||
/** [EXPERIMENTAL] Flag to enable token-level timestamps. (default = false) */
|
||||
public void tokenTimestamps(boolean enable) {
|
||||
token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** [EXPERIMENTAL] Timestamp token probability threshold (~0.01). (default = 0.01) */
|
||||
public float thold_pt;
|
||||
|
||||
/** [EXPERIMENTAL] Timestamp token sum probability threshold (~0.01). */
|
||||
public float thold_ptsum;
|
||||
|
||||
/** Maximum segment length in characters. (default = 0) */
|
||||
public int max_len;
|
||||
|
||||
/** Flag to split on word rather than on token (when used with max_len). (default = false) */
|
||||
public CBool split_on_word;
|
||||
|
||||
/** Flag to split on word rather than on token (when used with max_len). (default = false) */
|
||||
public void splitOnWord(boolean enable) {
|
||||
split_on_word = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Maximum tokens per segment (0, default = no limit) */
|
||||
public int max_tokens;
|
||||
|
||||
/** Overwrite the audio context size (0 = use default). */
|
||||
public int audio_ctx;
|
||||
|
||||
/** Enable tinydiarize (default = false) */
|
||||
public CBool tdrz_enable;
|
||||
|
||||
/** Enable tinydiarize (default = false) */
|
||||
public void tdrzEnable(boolean enable) {
|
||||
tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Regular expression matching tokens to suppress. */
|
||||
public String suppress_regex;
|
||||
|
||||
/** Tokens to provide to the whisper decoder as an initial prompt.
|
||||
* These are prepended to any existing text context from a previous call. */
|
||||
public String initial_prompt;
|
||||
|
||||
/** Prompt tokens. (int*) */
|
||||
public Pointer prompt_tokens;
|
||||
|
||||
public void setPromptTokens(int[] tokens) {
|
||||
Memory mem = new Memory(tokens.length * 4L);
|
||||
mem.write(0, tokens, 0, tokens.length);
|
||||
prompt_tokens = mem;
|
||||
}
|
||||
|
||||
/** Number of prompt tokens. */
|
||||
public int prompt_n_tokens;
|
||||
|
||||
/** Language for auto-detection.
|
||||
* For auto-detection, set to `null`, `""`, or "auto". */
|
||||
public String language;
|
||||
|
||||
/** Flag to indicate whether to detect language automatically. */
|
||||
public CBool detect_language;
|
||||
|
||||
/** Flag to indicate whether to detect language automatically. */
|
||||
public void detectLanguage(boolean enable) {
|
||||
detect_language = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
// Common decoding parameters.
|
||||
|
||||
/** Flag to suppress blank tokens. */
|
||||
public CBool suppress_blank;
|
||||
|
||||
public void suppressBlanks(boolean enable) {
|
||||
suppress_blank = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to suppress non-speech tokens. */
|
||||
public CBool suppress_nst;
|
||||
|
||||
/** Flag to suppress non-speech tokens. */
|
||||
public void suppressNonSpeechTokens(boolean enable) {
|
||||
suppress_nst = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Initial decoding temperature. */
|
||||
public float temperature;
|
||||
|
||||
/** Maximum initial timestamp. */
|
||||
public float max_initial_ts;
|
||||
|
||||
/** Length penalty. */
|
||||
public float length_penalty;
|
||||
|
||||
// Fallback parameters.
|
||||
|
||||
/** Temperature increment. */
|
||||
public float temperature_inc;
|
||||
|
||||
/** Entropy threshold (similar to OpenAI's "compression_ratio_threshold"). */
|
||||
public float entropy_thold;
|
||||
|
||||
/** Log probability threshold. */
|
||||
public float logprob_thold;
|
||||
|
||||
/** No speech threshold. */
|
||||
public float no_speech_thold;
|
||||
|
||||
/** Greedy decoding parameters. */
|
||||
public GreedyParams greedy;
|
||||
|
||||
/**
|
||||
* Beam search decoding parameters.
|
||||
*/
|
||||
public BeamSearchParams beam_search;
|
||||
|
||||
public void setBestOf(int bestOf) {
|
||||
if (greedy == null) {
|
||||
greedy = new GreedyParams();
|
||||
}
|
||||
greedy.best_of = bestOf;
|
||||
}
|
||||
|
||||
public void setBeamSize(int beamSize) {
|
||||
if (beam_search == null) {
|
||||
beam_search = new BeamSearchParams();
|
||||
}
|
||||
beam_search.beam_size = beamSize;
|
||||
}
|
||||
|
||||
public void setBeamSizeAndPatience(int beamSize, float patience) {
|
||||
if (beam_search == null) {
|
||||
beam_search = new BeamSearchParams();
|
||||
}
|
||||
beam_search.beam_size = beamSize;
|
||||
beam_search.patience = patience;
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback for every newly generated text segment.
|
||||
* WhisperNewSegmentCallback
|
||||
*/
|
||||
public Pointer new_segment_callback;
|
||||
|
||||
/**
|
||||
* User data for the new_segment_callback.
|
||||
*/
|
||||
public Pointer new_segment_callback_user_data;
|
||||
|
||||
/**
|
||||
* Callback on each progress update.
|
||||
* WhisperProgressCallback
|
||||
*/
|
||||
public Pointer progress_callback;
|
||||
|
||||
/**
|
||||
* User data for the progress_callback.
|
||||
*/
|
||||
public Pointer progress_callback_user_data;
|
||||
|
||||
/**
|
||||
* Callback each time before the encoder starts.
|
||||
* WhisperEncoderBeginCallback
|
||||
*/
|
||||
public Pointer encoder_begin_callback;
|
||||
|
||||
/**
|
||||
* User data for the encoder_begin_callback.
|
||||
*/
|
||||
public Pointer encoder_begin_callback_user_data;
|
||||
|
||||
/**
|
||||
* Callback by each decoder to filter obtained logits.
|
||||
* WhisperLogitsFilterCallback
|
||||
*/
|
||||
public Pointer logits_filter_callback;
|
||||
|
||||
/**
|
||||
* User data for the logits_filter_callback.
|
||||
*/
|
||||
public Pointer logits_filter_callback_user_data;
|
||||
|
||||
|
||||
public void setNewSegmentCallback(WhisperNewSegmentCallback callback) {
|
||||
new_segment_callback = CallbackReference.getFunctionPointer(callback);
|
||||
}
|
||||
|
||||
public void setProgressCallback(WhisperProgressCallback callback) {
|
||||
progress_callback = CallbackReference.getFunctionPointer(callback);
|
||||
}
|
||||
|
||||
public void setEncoderBeginCallbackeginCallbackCallback(WhisperEncoderBeginCallback callback) {
|
||||
encoder_begin_callback = CallbackReference.getFunctionPointer(callback);
|
||||
}
|
||||
|
||||
public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
|
||||
logits_filter_callback = CallbackReference.getFunctionPointer(callback);
|
||||
}
|
||||
|
||||
/** Grammar stuff */
|
||||
public Pointer grammar_rules;
|
||||
public long n_grammar_rules;
|
||||
public long i_start_rule;
|
||||
public float grammar_penalty;
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
|
||||
"no_context", "single_segment", "no_timestamps",
|
||||
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
||||
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty",
|
||||
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
||||
"new_segment_callback", "new_segment_callback_user_data",
|
||||
"progress_callback", "progress_callback_user_data",
|
||||
"encoder_begin_callback", "encoder_begin_callback_user_data",
|
||||
"logits_filter_callback", "logits_filter_callback_user_data",
|
||||
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
public class WhisperHParams {
|
||||
int n_vocab = 51864;
|
||||
int n_audio_ctx = 1500;
|
||||
int n_audio_state = 384;
|
||||
int n_audio_head = 6;
|
||||
int n_audio_layer = 4;
|
||||
int n_text_ctx = 448;
|
||||
int n_text_state = 384;
|
||||
int n_text_head = 6;
|
||||
int n_text_layer = 4;
|
||||
int n_mels = 80;
|
||||
int ftype = 1;
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
/** Available sampling strategies */
|
||||
public enum WhisperSamplingStrategy {
|
||||
/** similar to OpenAI's GreedyDecoder */
|
||||
WHISPER_SAMPLING_GREEDY,
|
||||
|
||||
/** similar to OpenAI's BeamSearchDecoder */
|
||||
WHISPER_SAMPLING_BEAM_SEARCH
|
||||
}
|
@ -1,144 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import io.github.ggerganov.whispercpp.bean.WhisperSegment;
|
||||
import io.github.ggerganov.whispercpp.params.CBool;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import javax.sound.sampled.AudioInputStream;
|
||||
import javax.sound.sampled.AudioSystem;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.List;
|
||||
|
||||
class WhisperCppTest {
|
||||
private static WhisperCpp whisper = new WhisperCpp();
|
||||
private static boolean modelInitialised = false;
|
||||
|
||||
@BeforeAll
|
||||
static void init() throws FileNotFoundException {
|
||||
// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
|
||||
// or you can provide the absolute path to the model file.
|
||||
//String modelName = "../../models/ggml-tiny.bin";
|
||||
String modelName = "../../models/ggml-tiny.en.bin";
|
||||
try {
|
||||
whisper.initContext(modelName);
|
||||
//whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
//whisper.getJavaDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
modelInitialised = true;
|
||||
} catch (FileNotFoundException ex) {
|
||||
System.out.println("Model " + modelName + " not found");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGetDefaultFullParams_BeamSearch() {
|
||||
// When
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
|
||||
// Then
|
||||
assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal(), params.strategy);
|
||||
assertNotEquals(0, params.n_threads);
|
||||
assertEquals(16384, params.n_max_text_ctx);
|
||||
assertFalse(params.translate);
|
||||
assertEquals(0.01f, params.thold_pt);
|
||||
assertEquals(5, params.beam_search.beam_size);
|
||||
assertEquals(-1.0f, params.beam_search.patience);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGetDefaultFullParams_Greedy() {
|
||||
// When
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
// Then
|
||||
assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
|
||||
assertNotEquals(0, params.n_threads);
|
||||
assertEquals(16384, params.n_max_text_ctx);
|
||||
assertEquals(5, params.greedy.best_of);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFullTranscribe() throws Exception {
|
||||
if (!modelInitialised) {
|
||||
System.out.println("Model not initialised, skipping test");
|
||||
return;
|
||||
}
|
||||
|
||||
// Given
|
||||
File file = new File(System.getProperty("user.dir"), "../../samples/jfk.wav");
|
||||
AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(file);
|
||||
|
||||
byte[] b = new byte[audioInputStream.available()];
|
||||
float[] floats = new float[b.length / 2];
|
||||
|
||||
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
||||
params.print_progress = CBool.FALSE;
|
||||
//params.initial_prompt = "and so my fellow Americans um, like";
|
||||
|
||||
|
||||
try {
|
||||
audioInputStream.read(b);
|
||||
|
||||
for (int i = 0, j = 0; i < b.length; i += 2, j++) {
|
||||
int intSample = (int) (b[i + 1]) << 8 | (int) (b[i]) & 0xFF;
|
||||
floats[j] = intSample / 32767.0f;
|
||||
}
|
||||
|
||||
// When
|
||||
String result = whisper.fullTranscribe(params, floats);
|
||||
|
||||
// Then
|
||||
System.err.println(result);
|
||||
assertEquals("And so my fellow Americans ask not what your country can do for you " +
|
||||
"ask what you can do for your country.",
|
||||
result.replace(",", ""));
|
||||
} finally {
|
||||
audioInputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFullTranscribeWithTime() throws Exception {
|
||||
if (!modelInitialised) {
|
||||
System.out.println("Model not initialised, skipping test");
|
||||
return;
|
||||
}
|
||||
|
||||
// Given
|
||||
File file = new File(System.getProperty("user.dir"), "../../samples/jfk.wav");
|
||||
AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(file);
|
||||
|
||||
byte[] b = new byte[audioInputStream.available()];
|
||||
float[] floats = new float[b.length / 2];
|
||||
|
||||
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
||||
params.print_progress = CBool.FALSE;
|
||||
//params.initial_prompt = "and so my fellow Americans um, like";
|
||||
|
||||
try {
|
||||
audioInputStream.read(b);
|
||||
|
||||
for (int i = 0, j = 0; i < b.length; i += 2, j++) {
|
||||
int intSample = (int) (b[i + 1]) << 8 | (int) (b[i]) & 0xFF;
|
||||
floats[j] = intSample / 32767.0f;
|
||||
}
|
||||
|
||||
List<WhisperSegment> segments = whisper.fullTranscribeWithTime(params, floats);
|
||||
assertTrue(segments.size() > 0, "The size of segments should be greater than 0");
|
||||
for (WhisperSegment segment : segments) {
|
||||
System.out.println(segment);
|
||||
}
|
||||
} finally {
|
||||
audioInputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class WhisperJnaLibraryTest {
|
||||
|
||||
@Test
|
||||
void testWhisperPrint_system_info() {
|
||||
String systemInfo = WhisperCppJnaLibrary.instance.whisper_print_system_info();
|
||||
// eg: "AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0
|
||||
// | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | "
|
||||
System.out.println("System info: " + systemInfo);
|
||||
assertTrue(systemInfo.length() > 10);
|
||||
}
|
||||
}
|
@ -20,22 +20,15 @@ if (WHISPER_WASM_SINGLE_FILE)
|
||||
${CMAKE_BINARY_DIR}/bin/libwhisper.js
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/whisper.js
|
||||
)
|
||||
|
||||
add_custom_command(
|
||||
TARGET ${TARGET} POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${CMAKE_BINARY_DIR}/bin/libwhisper.worker.js
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/libwhisper.worker.js
|
||||
)
|
||||
endif()
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
--bind \
|
||||
-s MODULARIZE=1 \
|
||||
-s EXPORT_NAME=\"'whisper_factory'\" \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s USE_PTHREADS=1 \
|
||||
-s PTHREAD_POOL_SIZE=8 \
|
||||
-s ALLOW_MEMORY_GROWTH=1 \
|
||||
-s INITIAL_MEMORY=1610612736 \
|
||||
-s TOTAL_MEMORY=1610612736 \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
")
|
||||
|
@ -1,78 +0,0 @@
|
||||
# whisper.cpp
|
||||
|
||||
Node.js package for Whisper speech recognition
|
||||
|
||||
Package: https://www.npmjs.com/package/whisper.cpp
|
||||
|
||||
## Details
|
||||
|
||||
The performance is comparable to when running `whisper.cpp` in the browser via WASM.
|
||||
|
||||
The API is currently very rudimentary: [bindings/javascript/emscripten.cpp](/bindings/javascript/emscripten.cpp)
|
||||
|
||||
For sample usage check [tests/test-whisper.js](/tests/test-whisper.js)
|
||||
|
||||
## Package building + test
|
||||
|
||||
```bash
|
||||
# load emscripten
|
||||
source /path/to/emsdk/emsdk_env.sh
|
||||
|
||||
# clone repo
|
||||
git clone https://github.com/ggerganov/whisper.cpp
|
||||
cd whisper.cpp
|
||||
|
||||
# grab base.en model
|
||||
./models/download-ggml-model.sh base.en
|
||||
|
||||
# prepare PCM sample for testing
|
||||
ffmpeg -i samples/jfk.wav -f f32le -acodec pcm_f32le samples/jfk.pcmf32
|
||||
|
||||
# build
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake .. && make -j
|
||||
|
||||
# run test
|
||||
node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
|
||||
|
||||
# publish npm package
|
||||
make publish-npm
|
||||
```
|
||||
|
||||
## Sample run
|
||||
|
||||
```text
|
||||
$ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
|
||||
|
||||
whisper_model_load: loading model from 'whisper.bin'
|
||||
whisper_model_load: n_vocab = 51864
|
||||
whisper_model_load: n_audio_ctx = 1500
|
||||
whisper_model_load: n_audio_state = 512
|
||||
whisper_model_load: n_audio_head = 8
|
||||
whisper_model_load: n_audio_layer = 6
|
||||
whisper_model_load: n_text_ctx = 448
|
||||
whisper_model_load: n_text_state = 512
|
||||
whisper_model_load: n_text_head = 8
|
||||
whisper_model_load: n_text_layer = 6
|
||||
whisper_model_load: n_mels = 80
|
||||
whisper_model_load: f16 = 1
|
||||
whisper_model_load: type = 2
|
||||
whisper_model_load: adding 1607 extra tokens
|
||||
whisper_model_load: mem_required = 506.00 MB
|
||||
whisper_model_load: ggml ctx size = 140.60 MB
|
||||
whisper_model_load: memory size = 22.83 MB
|
||||
whisper_model_load: model size = 140.54 MB
|
||||
|
||||
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
|
||||
|
||||
operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...
|
||||
|
||||
[00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
|
||||
|
||||
whisper_print_timings: load time = 162.37 ms
|
||||
whisper_print_timings: mel time = 183.70 ms
|
||||
whisper_print_timings: sample time = 4.27 ms
|
||||
whisper_print_timings: encode time = 8582.63 ms / 1430.44 ms per layer
|
||||
whisper_print_timings: decode time = 436.16 ms / 72.69 ms per layer
|
||||
whisper_print_timings: total time = 9370.90 ms
|
||||
```
|
@ -1,48 +1,63 @@
|
||||
//
|
||||
// This is the Javascript API of whisper.cpp
|
||||
//
|
||||
// Very crude at the moment.
|
||||
// Feel free to contribute and make this better!
|
||||
//
|
||||
// See the tests/test-whisper.js for sample usage
|
||||
//
|
||||
|
||||
#include "whisper.h"
|
||||
|
||||
#include <emscripten.h>
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
|
||||
struct whisper_context * g_context;
|
||||
std::thread g_worker;
|
||||
|
||||
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
||||
|
||||
EMSCRIPTEN_BINDINGS(whisper) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
if (g_context == nullptr) {
|
||||
g_context = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
|
||||
if (g_context != nullptr) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
return i + 1;
|
||||
} else {
|
||||
return (size_t) 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return (size_t) 0;
|
||||
}));
|
||||
|
||||
emscripten::function("free", emscripten::optional_override([]() {
|
||||
if (g_context) {
|
||||
whisper_free(g_context);
|
||||
g_context = nullptr;
|
||||
emscripten::function("free", emscripten::optional_override([](size_t index) {
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
|
||||
--index;
|
||||
|
||||
if (index < g_contexts.size()) {
|
||||
whisper_free(g_contexts[index]);
|
||||
g_contexts[index] = nullptr;
|
||||
}
|
||||
}));
|
||||
|
||||
emscripten::function("full_default", emscripten::optional_override([](const emscripten::val & audio, const std::string & lang, bool translate) {
|
||||
if (g_context == nullptr) {
|
||||
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
|
||||
--index;
|
||||
|
||||
if (index >= g_contexts.size()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (g_contexts[index] == nullptr) {
|
||||
return -2;
|
||||
}
|
||||
|
||||
struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
params.print_realtime = true;
|
||||
@ -50,7 +65,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
||||
params.print_timestamps = true;
|
||||
params.print_special = false;
|
||||
params.translate = translate;
|
||||
params.language = whisper_is_multilingual(g_context) ? lang.c_str() : "en";
|
||||
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
||||
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
|
||||
params.offset_ms = 0;
|
||||
|
||||
@ -67,11 +82,9 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
||||
|
||||
// print system information
|
||||
{
|
||||
printf("\n");
|
||||
printf("system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||
|
||||
printf("\n");
|
||||
printf("%s: processing %d samples, %.1f sec, %d threads, %d processors, lang = %s, task = %s ...\n",
|
||||
__func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads, 1,
|
||||
@ -81,11 +94,13 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// run whisper
|
||||
// run the worker
|
||||
{
|
||||
whisper_reset_timings(g_context);
|
||||
whisper_full(g_context, params, pcmf32.data(), pcmf32.size());
|
||||
whisper_print_timings(g_context);
|
||||
g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
|
||||
whisper_reset_timings(g_contexts[index]);
|
||||
whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
|
||||
whisper_print_timings(g_contexts[index]);
|
||||
});
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -1 +0,0 @@
|
||||
"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:f=>(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f),postMessage:msg=>parentPort.postMessage(msg),performance:global.performance||{now:Date.now}})}var initializedJS=false;function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var module=Module["wasmModule"];Module["wasmModule"]=null;var instance=new WebAssembly.Instance(module,info);return receiveInstance(instance)};self.onunhandledrejection=e=>{throw e.reason||e};function handleMessage(e){try{if(e.data.cmd==="load"){let messageQueue=[];self.onmessage=e=>messageQueue.push(e);self.startWorker=instance=>{Module=instance;postMessage({"cmd":"loaded"});for(let msg of messageQueue){handleMessage(msg)}self.onmessage=handleMessage};Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=(...args)=>{postMessage({cmd:"callHandler",handler:handler,args:args})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module)}else if(e.data.cmd==="run"){Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["__emscripten_thread_mailbox_await"](e.data.pthread_ptr);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){throw ex}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="checkMailbox"){if(initializedJS){Module["checkMailbox"]()}}else if(e.data.cmd){err(`worker.js received unknown command ${e.data.cmd}`);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}}self.onmessage=handleMessage;
|
@ -1,26 +0,0 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "@PROJECT_VERSION@",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
"test": "echo \"todo: add tests\" && exit 0"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/ggerganov/whisper.cpp"
|
||||
},
|
||||
"keywords": [
|
||||
"openai",
|
||||
"whisper",
|
||||
"speech-to-text",
|
||||
"speech-recognition",
|
||||
"transformer"
|
||||
],
|
||||
"author": "Georgi Gerganov",
|
||||
"license": "MIT",
|
||||
"bugs": {
|
||||
"url": "https://github.com/ggerganov/whisper.cpp/issues"
|
||||
},
|
||||
"homepage": "https://github.com/ggerganov/whisper.cpp#readme"
|
||||
}
|
@ -1,26 +0,0 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.7.4",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
"test": "echo \"todo: add tests\" && exit 0"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/ggerganov/whisper.cpp"
|
||||
},
|
||||
"keywords": [
|
||||
"openai",
|
||||
"whisper",
|
||||
"speech-to-text",
|
||||
"speech-recognition",
|
||||
"transformer"
|
||||
],
|
||||
"author": "Georgi Gerganov",
|
||||
"license": "MIT",
|
||||
"bugs": {
|
||||
"url": "https://github.com/ggerganov/whisper.cpp/issues"
|
||||
},
|
||||
"homepage": "https://github.com/ggerganov/whisper.cpp#readme"
|
||||
}
|
File diff suppressed because one or more lines are too long
3
bindings/ruby/.gitignore
vendored
3
bindings/ruby/.gitignore
vendored
@ -1,3 +0,0 @@
|
||||
LICENSE
|
||||
pkg/
|
||||
lib/whisper.*
|
@ -1,243 +0,0 @@
|
||||
whispercpp
|
||||
==========
|
||||
|
||||

|
||||
|
||||
Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Install the gem and add to the application's Gemfile by executing:
|
||||
|
||||
$ bundle add whispercpp
|
||||
|
||||
If bundler is not being used to manage dependencies, install the gem by executing:
|
||||
|
||||
$ gem install whispercpp
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
```ruby
|
||||
require "whisper"
|
||||
|
||||
whisper = Whisper::Context.new("base")
|
||||
|
||||
params = Whisper::Params.new
|
||||
params.language = "en"
|
||||
params.offset = 10_000
|
||||
params.duration = 60_000
|
||||
params.max_text_tokens = 300
|
||||
params.translate = true
|
||||
params.print_timestamps = false
|
||||
params.initial_prompt = "Initial prompt here."
|
||||
|
||||
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
|
||||
puts whole_text
|
||||
end
|
||||
|
||||
```
|
||||
|
||||
### Preparing model ###
|
||||
|
||||
Some models are prepared up-front:
|
||||
|
||||
```ruby
|
||||
base_en = Whisper::Model.pre_converted_models["base.en"]
|
||||
whisper = Whisper::Context.new(base_en)
|
||||
```
|
||||
|
||||
At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
|
||||
|
||||
```ruby
|
||||
Whisper::Model.pre_converted_models["base"].clear_cache
|
||||
```
|
||||
|
||||
You also can use shorthand for pre-converted models:
|
||||
|
||||
```ruby
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
```
|
||||
|
||||
You can see the list of prepared model names by `Whisper::Model.pre_converted_models.keys`:
|
||||
|
||||
```ruby
|
||||
puts Whisper::Model.pre_converted_models.keys
|
||||
# tiny
|
||||
# tiny.en
|
||||
# tiny-q5_1
|
||||
# tiny.en-q5_1
|
||||
# tiny-q8_0
|
||||
# base
|
||||
# base.en
|
||||
# base-q5_1
|
||||
# base.en-q5_1
|
||||
# base-q8_0
|
||||
# :
|
||||
# :
|
||||
```
|
||||
|
||||
You can also use local model files you prepared:
|
||||
|
||||
```ruby
|
||||
whisper = Whisper::Context.new("path/to/your/model.bin")
|
||||
```
|
||||
|
||||
Or, you can download model files:
|
||||
|
||||
```ruby
|
||||
whisper = Whisper::Context.new("https://example.net/uri/of/your/model.bin")
|
||||
# Or
|
||||
whisper = Whisper::Context.new(URI("https://example.net/uri/of/your/model.bin"))
|
||||
```
|
||||
|
||||
See [models][] page for details.
|
||||
|
||||
### Preparing audio file ###
|
||||
|
||||
Currently, whisper.cpp accepts only 16-bit WAV files.
|
||||
|
||||
API
|
||||
---
|
||||
|
||||
### Segments ###
|
||||
|
||||
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
|
||||
|
||||
```ruby
|
||||
def format_time(time_ms)
|
||||
sec, decimal_part = time_ms.divmod(1000)
|
||||
min, sec = sec.divmod(60)
|
||||
hour, min = min.divmod(60)
|
||||
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
|
||||
end
|
||||
|
||||
whisper.transcribe("path/to/audio.wav", params)
|
||||
|
||||
whisper.each_segment.with_index do |segment, index|
|
||||
line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
|
||||
nth: index + 1,
|
||||
st: format_time(segment.start_time),
|
||||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
puts line
|
||||
end
|
||||
|
||||
```
|
||||
|
||||
You can also add hook to params called on new segment:
|
||||
|
||||
```ruby
|
||||
# Add hook before calling #transcribe
|
||||
params.on_new_segment do |segment|
|
||||
line = "[%{st} --> %{ed}] %{text}" % {
|
||||
st: format_time(segment.start_time),
|
||||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
puts line
|
||||
end
|
||||
|
||||
whisper.transcribe("path/to/audio.wav", params)
|
||||
|
||||
```
|
||||
|
||||
### Models ###
|
||||
|
||||
You can see model information:
|
||||
|
||||
```ruby
|
||||
whisper = Whisper::Context.new("base")
|
||||
model = whisper.model
|
||||
|
||||
model.n_vocab # => 51864
|
||||
model.n_audio_ctx # => 1500
|
||||
model.n_audio_state # => 512
|
||||
model.n_audio_head # => 8
|
||||
model.n_audio_layer # => 6
|
||||
model.n_text_ctx # => 448
|
||||
model.n_text_state # => 512
|
||||
model.n_text_head # => 8
|
||||
model.n_text_layer # => 6
|
||||
model.n_mels # => 80
|
||||
model.ftype # => 1
|
||||
model.type # => "base"
|
||||
|
||||
```
|
||||
|
||||
### Logging ###
|
||||
|
||||
You can set log callback:
|
||||
|
||||
```ruby
|
||||
prefix = "[MyApp] "
|
||||
log_callback = ->(level, buffer, user_data) {
|
||||
case level
|
||||
when Whisper::LOG_LEVEL_NONE
|
||||
puts "#{user_data}none: #{buffer}"
|
||||
when Whisper::LOG_LEVEL_INFO
|
||||
puts "#{user_data}info: #{buffer}"
|
||||
when Whisper::LOG_LEVEL_WARN
|
||||
puts "#{user_data}warn: #{buffer}"
|
||||
when Whisper::LOG_LEVEL_ERROR
|
||||
puts "#{user_data}error: #{buffer}"
|
||||
when Whisper::LOG_LEVEL_DEBUG
|
||||
puts "#{user_data}debug: #{buffer}"
|
||||
when Whisper::LOG_LEVEL_CONT
|
||||
puts "#{user_data}same to previous: #{buffer}"
|
||||
end
|
||||
}
|
||||
Whisper.log_set log_callback, prefix
|
||||
```
|
||||
|
||||
Using this feature, you are also able to suppress log:
|
||||
|
||||
```ruby
|
||||
Whisper.log_set ->(level, buffer, user_data) {
|
||||
# do nothing
|
||||
}, nil
|
||||
Whisper::Context.new("base")
|
||||
```
|
||||
|
||||
### Low-level API to transcribe ###
|
||||
|
||||
You can also call `Whisper::Context#full` and `#full_parallel` with a Ruby array as samples. Although `#transcribe` with audio file path is recommended because it extracts PCM samples in C++ and is fast, `#full` and `#full_parallel` give you flexibility.
|
||||
|
||||
```ruby
|
||||
require "whisper"
|
||||
require "wavefile"
|
||||
|
||||
reader = WaveFile::Reader.new("path/to/audio.wav", WaveFile::Format.new(:mono, :float, 16000))
|
||||
samples = reader.enum_for(:each_buffer).map(&:samples).flatten
|
||||
|
||||
whisper = Whisper::Context.new("base")
|
||||
whisper.full(Whisper::Params.new, samples)
|
||||
whisper.each_segment do |segment|
|
||||
puts segment.text
|
||||
end
|
||||
```
|
||||
|
||||
The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
|
||||
|
||||
Development
|
||||
-----------
|
||||
|
||||
% git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
% cd whisper.cpp/bindings/ruby
|
||||
% rake test
|
||||
|
||||
First call of `rake test` builds an extension and downloads a model for testing. After that, you add tests in `tests` directory and modify `ext/ruby_whisper.cpp`.
|
||||
|
||||
If something seems wrong on build, running `rake clean` solves some cases.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
The same to [whisper.cpp][].
|
||||
|
||||
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
|
||||
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models
|
@ -1,64 +0,0 @@
|
||||
require 'rake/clean'
|
||||
require "bundler/gem_tasks"
|
||||
require "rake/testtask"
|
||||
require_relative "extsources"
|
||||
|
||||
SOURCES = FileList[]
|
||||
|
||||
EXTSOURCES.each do |src|
|
||||
basename = src.pathmap("%f")
|
||||
dest = basename == "LICENSE" ? basename : src.pathmap("%{../..,ext}p")
|
||||
dir = dest.pathmap("%d")
|
||||
file src
|
||||
directory dir
|
||||
file dest => [src, dir] do |t|
|
||||
cp t.source, t.name
|
||||
end
|
||||
SOURCES.include dest
|
||||
end
|
||||
|
||||
CLEAN.include SOURCES
|
||||
CLEAN.include FileList["ext/*.o", "ext/*.metal", "ext/whisper.{so,bundle,dll}"]
|
||||
|
||||
task build: ["ext/Makefile", "ext/ruby_whisper.h", "ext/ruby_whisper.cpp", "whispercpp.gemspec"]
|
||||
|
||||
directory "pkg"
|
||||
CLOBBER.include "pkg"
|
||||
|
||||
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
|
||||
SO_FILE = File.join("ext", LIB_NAME)
|
||||
LIB_FILE = File.join("lib", LIB_NAME)
|
||||
|
||||
file "ext/Makefile" => ["ext/extconf.rb", "ext/ruby_whisper.h", "ext/ruby_whisper.cpp"] + SOURCES do |t|
|
||||
Dir.chdir "ext" do
|
||||
ruby "extconf.rb"
|
||||
end
|
||||
end
|
||||
|
||||
file SO_FILE => "ext/Makefile" do |t|
|
||||
Dir.chdir "ext" do
|
||||
sh "make"
|
||||
end
|
||||
end
|
||||
CLEAN.include SO_FILE
|
||||
|
||||
directory "lib"
|
||||
file LIB_FILE => [SO_FILE, "lib"] do |t|
|
||||
copy t.source, t.name
|
||||
end
|
||||
CLEAN.include LIB_FILE
|
||||
|
||||
Rake::TestTask.new do |t|
|
||||
t.test_files = FileList["tests/test_*.rb"]
|
||||
end
|
||||
|
||||
TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
|
||||
file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
|
||||
Dir.chdir "tests/jfk_reader" do
|
||||
ruby "extconf.rb"
|
||||
sh "make"
|
||||
end
|
||||
end
|
||||
CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
|
||||
|
||||
task test: [LIB_FILE, TEST_MEMORY_VIEW]
|
13
bindings/ruby/ext/.gitignore
vendored
13
bindings/ruby/ext/.gitignore
vendored
@ -1,13 +0,0 @@
|
||||
Makefile
|
||||
whisper.so
|
||||
whisper.bundle
|
||||
whisper.dll
|
||||
scripts/get-flags.mk
|
||||
*.o
|
||||
*.c
|
||||
*.cpp
|
||||
*.h
|
||||
*.m
|
||||
*.metal
|
||||
!ruby_whisper.cpp
|
||||
!ruby_whisper.h
|
@ -1,9 +0,0 @@
|
||||
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
|
||||
ggml/src/ggml-cpu/ggml-cpu.cpp \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-alloc.h \
|
||||
ggml/src/ggml-backend-impl.h \
|
||||
ggml/include/ggml-cpu.h \
|
||||
ggml/src/ggml-impl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@ -1,199 +0,0 @@
|
||||
require 'mkmf'
|
||||
|
||||
# need to use c++ compiler flags
|
||||
$CXXFLAGS << ' -std=c++17'
|
||||
|
||||
$LDFLAGS << ' -lstdc++'
|
||||
|
||||
# Set to true when building binary gems
|
||||
if enable_config('static-stdlib', false)
|
||||
$LDFLAGS << ' -static-libgcc -static-libstdc++'
|
||||
end
|
||||
|
||||
if enable_config('march-tune-native', false)
|
||||
$CFLAGS << ' -march=native -mtune=native'
|
||||
$CXXFLAGS << ' -march=native -mtune=native'
|
||||
end
|
||||
|
||||
if ENV['WHISPER_METAL']
|
||||
$GGML_METAL ||= true
|
||||
$DEPRECATE_WARNING ||= true
|
||||
end
|
||||
|
||||
$UNAME_S = `uname -s`.chomp
|
||||
$UNAME_P = `uname -p`.chomp
|
||||
$UNAME_M = `uname -m`.chomp
|
||||
|
||||
if $UNAME_S == 'Darwin'
|
||||
unless ENV['GGML_NO_METAL']
|
||||
$GGML_METAL ||= true
|
||||
end
|
||||
$GGML_NO_OPENMP ||= true
|
||||
end
|
||||
|
||||
if $GGML_METAL
|
||||
$GGML_METAL_EMBED_LIBRARY = true
|
||||
end
|
||||
|
||||
$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples'
|
||||
$MK_CFLAGS = '-std=c11 -fPIC'
|
||||
$MK_CXXFLAGS = '-std=c++17 -fPIC'
|
||||
$MK_NVCCFLAGS = '-std=c++17'
|
||||
$MK_LDFLAGS = ''
|
||||
|
||||
$OBJ_GGML = []
|
||||
$OBJ_WHISPER = []
|
||||
$OBJ_COMMON = []
|
||||
$OBJ_SDL = []
|
||||
|
||||
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
|
||||
|
||||
if $UNAME_S == 'Linux'
|
||||
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
|
||||
end
|
||||
|
||||
if $UNAME_S == 'Darwin'
|
||||
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
|
||||
end
|
||||
|
||||
if ENV['WHISPER_DEBUG']
|
||||
$MK_CFLAGS << ' -O0 -g'
|
||||
$MK_CXXFLAGS << ' -O0 -g'
|
||||
$MK_LDFLAGS << ' -g'
|
||||
$MK_NVCCFLAGS << ' -O0 -g'
|
||||
else
|
||||
$MK_CPPFLAGS << ' -DNDEBUG'
|
||||
$MK_CFLAGS << ' -O3'
|
||||
$MK_CXXFLAGS << ' -O3'
|
||||
$MK_NVCCFLAGS << ' -O3'
|
||||
end
|
||||
|
||||
$WARN_FLAGS =
|
||||
' -Wall' <<
|
||||
' -Wextra' <<
|
||||
' -Wpedantic' <<
|
||||
' -Wcast-qual' <<
|
||||
' -Wno-unused-function'
|
||||
|
||||
$MK_CFLAGS <<
|
||||
$WARN_FLAGS <<
|
||||
' -Wshadow' <<
|
||||
' -Wstrict-prototypes' <<
|
||||
' -Wpointer-arith' <<
|
||||
' -Wmissing-prototypes' <<
|
||||
' -Werror=implicit-int' <<
|
||||
' -Werror=implicit-function-declaration'
|
||||
|
||||
$MK_CXXFLAGS <<
|
||||
$WARN_FLAGS <<
|
||||
' -Wmissing-declarations' <<
|
||||
' -Wmissing-noreturn'
|
||||
|
||||
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
|
||||
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
|
||||
end
|
||||
|
||||
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
|
||||
$MK_CFLAGS << ' -pthread'
|
||||
$MK_CXXFLAGS << ' -pthread'
|
||||
end
|
||||
|
||||
unless $_WIN32
|
||||
$DSO_EXT = '.so'
|
||||
else
|
||||
$DSO_EXT = '.dll'
|
||||
end
|
||||
|
||||
unless ENV['RISCV']
|
||||
if %w[x86_64 i686 amd64].include? $UNAME_M
|
||||
$HOST_CXXFLAGS ||= ''
|
||||
|
||||
$MK_CFLAGS << ' -march=native -mtune=native'
|
||||
$HOST_CXXFLAGS << ' -march=native -mtune=native'
|
||||
end
|
||||
else
|
||||
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
|
||||
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
|
||||
end
|
||||
|
||||
unless ENV['GGML_NO_ACCELERATE']
|
||||
if $UNAME_S == 'Darwin'
|
||||
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE'
|
||||
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
|
||||
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
|
||||
$MK_LDFLAGS << ' -framework Accelerate'
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
end
|
||||
|
||||
if ENV['GGML_OPENBLAS']
|
||||
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
|
||||
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
|
||||
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
|
||||
if ENV['GGML_OPENBLAS64']
|
||||
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
|
||||
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
|
||||
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
|
||||
if $GGML_METAL
|
||||
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
|
||||
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
|
||||
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal.o'
|
||||
|
||||
if ENV['GGML_METAL_NDEBUG']
|
||||
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
|
||||
end
|
||||
|
||||
if $GGML_METAL_EMBED_LIBRARY
|
||||
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
|
||||
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal-embed.o'
|
||||
end
|
||||
end
|
||||
|
||||
$OBJ_GGML <<
|
||||
'ggml/src/ggml.o' <<
|
||||
'ggml/src/ggml-alloc.o' <<
|
||||
'ggml/src/ggml-backend.o' <<
|
||||
'ggml/src/ggml-backend-reg.o' <<
|
||||
'ggml/src/ggml-opt.o' <<
|
||||
'ggml/src/ggml-quants.o' <<
|
||||
'ggml/src/ggml-threading.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-traits.o'
|
||||
|
||||
$OBJ_WHISPER <<
|
||||
'src/whisper.o'
|
||||
|
||||
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
|
||||
$objs << "ruby_whisper.o"
|
||||
|
||||
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
|
||||
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
|
||||
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
|
||||
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
|
||||
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
|
||||
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
|
||||
|
||||
create_makefile('whisper')
|
||||
|
||||
File.open 'Makefile', 'a' do |file|
|
||||
file.puts 'include scripts/get-flags.mk'
|
||||
file.puts 'include cpu.mk'
|
||||
|
||||
if $GGML_METAL
|
||||
file.puts 'include metal.mk'
|
||||
|
||||
if $GGML_METAL_EMBED_LIBRARY
|
||||
file.puts 'include metal-embed.mk'
|
||||
end
|
||||
end
|
||||
end
|
@ -1,17 +0,0 @@
|
||||
ggml/src/ggml-metal/ggml-metal-embed.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.metal \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/src/ggml-common.h
|
||||
@echo "Embedding Metal library"
|
||||
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
|
||||
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
||||
@rmdir ${TEMP_ASSEMBLY}
|
@ -1,6 +0,0 @@
|
||||
ggml/src/ggml-metal/ggml-metal.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.m \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/include/ggml-metal.h \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
File diff suppressed because it is too large
Load Diff
@ -1,25 +0,0 @@
|
||||
#ifndef RUBY_WHISPER_H
|
||||
#define RUBY_WHISPER_H
|
||||
|
||||
#include "whisper.h"
|
||||
|
||||
typedef struct {
|
||||
VALUE *context;
|
||||
VALUE user_data;
|
||||
VALUE callback;
|
||||
VALUE callbacks;
|
||||
} ruby_whisper_callback_container;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_context *context;
|
||||
} ruby_whisper;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_full_params params;
|
||||
bool diarize;
|
||||
ruby_whisper_callback_container *new_segment_callback_container;
|
||||
ruby_whisper_callback_container *progress_callback_container;
|
||||
ruby_whisper_callback_container *abort_callback_container;
|
||||
} ruby_whisper_params;
|
||||
|
||||
#endif
|
@ -1,6 +0,0 @@
|
||||
require "yaml"
|
||||
|
||||
sources = `git ls-files -z ../..`.split("\x0")
|
||||
paths = YAML.load_file("../../.github/workflows/bindings-ruby.yml")[true]["push"]["paths"]
|
||||
paths.delete "bindings/ruby/**"
|
||||
EXTSOURCES = (Dir.glob(paths, base: "../..").collect {|path| "../../#{path}"} << "../../LICENSE") & sources
|
@ -1,163 +0,0 @@
|
||||
require "uri"
|
||||
require "net/http"
|
||||
require "time"
|
||||
require "pathname"
|
||||
require "io/console/size"
|
||||
|
||||
module Whisper
|
||||
class Model
|
||||
class URI
|
||||
def initialize(uri)
|
||||
@uri = URI(uri)
|
||||
end
|
||||
|
||||
def to_path
|
||||
cache
|
||||
cache_path.to_path
|
||||
end
|
||||
|
||||
def clear_cache
|
||||
path = cache_path
|
||||
path.delete if path.exist?
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def cache_path
|
||||
base_cache_dir/@uri.host/@uri.path[1..]
|
||||
end
|
||||
|
||||
def base_cache_dir
|
||||
base = case RUBY_PLATFORM
|
||||
when /mswin|mingw/
|
||||
ENV.key?("LOCALAPPDATA") ? Pathname(ENV["LOCALAPPDATA"]) : Pathname(Dir.home)/"AppData/Local"
|
||||
when /darwin/
|
||||
Pathname(Dir.home)/"Library/Caches"
|
||||
else
|
||||
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
|
||||
end
|
||||
base/"whisper.cpp"
|
||||
end
|
||||
|
||||
def cache
|
||||
path = cache_path
|
||||
headers = {}
|
||||
headers["if-modified-since"] = path.mtime.httpdate if path.exist?
|
||||
request @uri, headers
|
||||
path
|
||||
end
|
||||
|
||||
def request(uri, headers)
|
||||
Net::HTTP.start uri.host, uri.port, use_ssl: uri.scheme == "https" do |http|
|
||||
request = Net::HTTP::Get.new(uri, headers)
|
||||
http.request request do |response|
|
||||
case response
|
||||
when Net::HTTPNotModified
|
||||
# noop
|
||||
when Net::HTTPOK
|
||||
download response
|
||||
when Net::HTTPRedirection
|
||||
request URI(response["location"]), headers
|
||||
else
|
||||
return if headers.key?("if-modified-since") # Use cache file
|
||||
|
||||
raise "#{response.code} #{response.message}\n#{response.body}"
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def download(response)
|
||||
path = cache_path
|
||||
path.dirname.mkpath unless path.dirname.exist?
|
||||
downloading_path = Pathname("#{path}.downloading")
|
||||
size = response.content_length
|
||||
downloading_path.open "wb" do |file|
|
||||
downloaded = 0
|
||||
response.read_body do |chunk|
|
||||
file << chunk
|
||||
downloaded += chunk.bytesize
|
||||
show_progress downloaded, size
|
||||
end
|
||||
$stderr.puts
|
||||
end
|
||||
downloading_path.rename path
|
||||
end
|
||||
|
||||
def show_progress(current, size)
|
||||
progress_rate_available = size && $stderr.tty?
|
||||
|
||||
unless @prev
|
||||
@prev = Time.now
|
||||
$stderr.puts "Downloading #{@uri} to #{cache_path}"
|
||||
end
|
||||
|
||||
now = Time.now
|
||||
|
||||
if progress_rate_available
|
||||
return if now - @prev < 1 && current < size
|
||||
|
||||
progress_width = 20
|
||||
progress = current.to_f / size
|
||||
arrow_length = progress * progress_width
|
||||
arrow = "=" * (arrow_length - 1) + ">" + " " * (progress_width - arrow_length)
|
||||
line = "[#{arrow}] (#{format_bytesize(current)} / #{format_bytesize(size)})"
|
||||
padding = ' ' * ($stderr.winsize[1] - line.size)
|
||||
$stderr.print "\r#{line}#{padding}"
|
||||
else
|
||||
return if now - @prev < 1
|
||||
|
||||
$stderr.print "."
|
||||
end
|
||||
@prev = now
|
||||
end
|
||||
|
||||
def format_bytesize(bytesize)
|
||||
return "0.0 B" if bytesize.zero?
|
||||
|
||||
units = %w[B KiB MiB GiB TiB]
|
||||
exp = (Math.log(bytesize) / Math.log(1024)).to_i
|
||||
format("%.1f %s", bytesize.to_f / 1024 ** exp, units[exp])
|
||||
end
|
||||
end
|
||||
|
||||
@pre_converted_models = %w[
|
||||
tiny
|
||||
tiny.en
|
||||
tiny-q5_1
|
||||
tiny.en-q5_1
|
||||
tiny-q8_0
|
||||
base
|
||||
base.en
|
||||
base-q5_1
|
||||
base.en-q5_1
|
||||
base-q8_0
|
||||
small
|
||||
small.en
|
||||
small.en-tdrz
|
||||
small-q5_1
|
||||
small.en-q5_1
|
||||
small-q8_0
|
||||
medium
|
||||
medium.en
|
||||
medium-q5_0
|
||||
medium.en-q5_0
|
||||
medium-q8_0
|
||||
large-v1
|
||||
large-v2
|
||||
large-v2-q5_0
|
||||
large-v2-q8_0
|
||||
large-v3
|
||||
large-v3-q5_0
|
||||
large-v3-turbo
|
||||
large-v3-turbo-q5_0
|
||||
large-v3-turbo-q8_0
|
||||
].each_with_object({}) {|name, models|
|
||||
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
|
||||
}
|
||||
|
||||
class << self
|
||||
attr_reader :pre_converted_models
|
||||
end
|
||||
end
|
||||
end
|
@ -1,153 +0,0 @@
|
||||
module Whisper
|
||||
interface _Samples
|
||||
def length: () -> Integer
|
||||
def each: { (Float) -> void } -> void
|
||||
end
|
||||
|
||||
type log_callback = ^(Integer level, String message, Object user_data) -> void
|
||||
type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
|
||||
type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
|
||||
type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish
|
||||
|
||||
LOG_LEVEL_NONE: Integer
|
||||
LOG_LEVEL_INFO: Integer
|
||||
LOG_LEVEL_WARN: Integer
|
||||
LOG_LEVEL_ERROR: Integer
|
||||
LOG_LEVEL_DEBUG: Integer
|
||||
LOG_LEVEL_CONT: Integer
|
||||
|
||||
def self.lang_max_id: () -> Integer
|
||||
def self.lang_id: (string name) -> Integer
|
||||
def self.lang_str: (Integer id) -> String
|
||||
def self.lang_str_full: (Integer id) -> String
|
||||
def self.log_set=: (log_callback) -> log_callback
|
||||
def self.finalize_log_callback: (void) -> void # Second argument of ObjectSpace.define_finalizer
|
||||
|
||||
class Context
|
||||
def initialize: (string | _ToPath | ::URI::HTTP ) -> void
|
||||
def transcribe: (string, Params) -> void
|
||||
| (string, Params) { (String) -> void } -> void
|
||||
def model_n_vocab: () -> Integer
|
||||
def model_n_audio_ctx: () -> Integer
|
||||
def model_n_audio_state: () -> Integer
|
||||
def model_n_text_head: () -> Integer
|
||||
def model_n_text_layer: () -> Integer
|
||||
def model_n_mels: () -> Integer
|
||||
def model_ftype: () -> Integer
|
||||
def model_type: () -> String
|
||||
def full_n_segments: () -> Integer
|
||||
def full_lang_id: () -> Integer
|
||||
def full_get_segment_t0: (Integer) -> Integer
|
||||
def full_get_segment_t1: (Integer) -> Integer
|
||||
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
|
||||
def full_get_segment_text: (Integer) -> String
|
||||
def full_get_segment_no_speech_prob: (Integer) -> Float
|
||||
def full: (Params, Array[Float], ?Integer) -> void
|
||||
| (Params, _Samples, ?Integer) -> void
|
||||
def full_parallel: (Params, Array[Float], ?Integer) -> void
|
||||
| (Params, _Samples, ?Integer) -> void
|
||||
| (Params, _Samples, ?Integer?, Integer) -> void
|
||||
def each_segment: { (Segment) -> void } -> void
|
||||
| () -> Enumerator[Segment]
|
||||
def model: () -> Model
|
||||
end
|
||||
|
||||
class Params
|
||||
def initialize: () -> void
|
||||
def language=: (String) -> String # TODO: Enumerate lang names
|
||||
def language: () -> String
|
||||
def translate=: (boolish) -> boolish
|
||||
def translate: () -> (true | false)
|
||||
def no_context=: (boolish) -> boolish
|
||||
def no_context: () -> (true | false)
|
||||
def single_segment=: (boolish) -> boolish
|
||||
def single_segment: () -> (true | false)
|
||||
def print_special=: (boolish) -> boolish
|
||||
def print_special: () -> (true | false)
|
||||
def print_progress=: (boolish) -> boolish
|
||||
def print_progress: () -> (true | false)
|
||||
def print_realtime=: (boolish) -> boolish
|
||||
def print_realtime: () -> (true | false)
|
||||
def print_timestamps=: (boolish) -> boolish
|
||||
def print_timestamps: () -> (true | false)
|
||||
def suppress_blank=: (boolish) -> boolish
|
||||
def suppress_blank: () -> (true | false)
|
||||
def suppress_nst=: (boolish) -> boolish
|
||||
def suppress_nst: () -> (true | false)
|
||||
def token_timestamps=: (boolish) -> boolish
|
||||
def token_timestamps: () -> (true | false)
|
||||
def split_on_word=: (boolish) -> boolish
|
||||
def split_on_word: () -> (true | false)
|
||||
def initial_prompt=: (_ToS) -> _ToS
|
||||
def initial_prompt: () -> String
|
||||
def diarize=: (boolish) -> boolish
|
||||
def diarize: () -> (true | false)
|
||||
def offset=: (Integer) -> Integer
|
||||
def offset: () -> Integer
|
||||
def duration=: (Integer) -> Integer
|
||||
def duration: () -> Integer
|
||||
def max_text_tokens=: (Integer) -> Integer
|
||||
def max_text_tokens: () -> Integer
|
||||
def temperature=: (Float) -> Float
|
||||
def temperature: () -> Float
|
||||
def max_initial_ts=: (Float) -> Float
|
||||
def max_initial_ts: () -> Float
|
||||
def length_penalty=: (Float) -> Float
|
||||
def length_penalty: () -> Float
|
||||
def temperature_inc=: (Float) -> Float
|
||||
def temperature_inc: () -> Float
|
||||
def entropy_thold=: (Float) -> Float
|
||||
def entropy_thold: () -> Float
|
||||
def logprob_thold=: (Float) -> Float
|
||||
def logprob_thold: () -> Float
|
||||
def no_speech_thold=: (Float) -> Float
|
||||
def no_speech_thold: () -> Float
|
||||
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
|
||||
def new_segment_callback_user_data=: (Object) -> Object
|
||||
def progress_callback=: (progress_callback) -> progress_callback
|
||||
def progress_callback_user_data=: (Object) -> Object
|
||||
def abort_callback=: (abort_callback) -> abort_callback
|
||||
def abort_callback_user_data=: (Object) -> Object
|
||||
def on_new_segment: { (Segment) -> void } -> void
|
||||
def on_progress: { (Integer) -> void } -> void
|
||||
def abort_on: { (Object) -> boolish } -> void
|
||||
end
|
||||
|
||||
class Model
|
||||
def self.pre_converted_models: () -> Hash[String, Model::URI]
|
||||
def initialize: () -> void
|
||||
def n_vocab: () -> Integer
|
||||
def n_audio_ctx: () -> Integer
|
||||
def n_audio_state: () -> Integer
|
||||
def n_audio_head: () -> Integer
|
||||
def n_audio_layer: () -> Integer
|
||||
def n_text_ctx: () -> Integer
|
||||
def n_text_state: () -> Integer
|
||||
def n_text_head: () -> Integer
|
||||
def n_text_layer: () -> Integer
|
||||
def n_mels: () -> Integer
|
||||
def ftype: () -> Integer
|
||||
def type: () -> String
|
||||
|
||||
class URI
|
||||
def initialize: (string | ::URI::HTTP) -> void
|
||||
def to_path: -> String
|
||||
def clear_cache: -> void
|
||||
end
|
||||
end
|
||||
|
||||
class Segment
|
||||
def initialize: () -> void
|
||||
def start_time: () -> Integer
|
||||
def end_time: () -> Integer
|
||||
def speaker_next_turn?: () -> (true | false)
|
||||
def text: () -> String
|
||||
def no_speech_prob: () -> Float
|
||||
end
|
||||
|
||||
class Error < StandardError
|
||||
attr_reader code: Integer
|
||||
|
||||
def initialize: (Integer) -> void
|
||||
end
|
||||
end
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user