Compare commits

..

1 Commits

Author SHA1 Message Date
a0da7f71a2 command : wip in progress, improve guided decoding 2023-02-19 19:39:05 +02:00
965 changed files with 28046 additions and 390194 deletions

View File

@ -1,28 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git cmake libsdl2-dev wget git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV GGML_CUDA=1
RUN make base.en
ENTRYPOINT ["/app/main"]

View File

@ -1,40 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.3.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
WORKDIR /app
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV GGML_CUDA=1
RUN apt-get update && \
apt-get install -y build-essential libsdl2-dev wget cmake git \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
# Ref: https://stackoverflow.com/a/53464012
ENV CUDA_MAIN_VERSION=12.3
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
COPY .. .
RUN make base.en
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
ENV CUDA_MAIN_VERSION=12.3
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
WORKDIR /app
RUN apt-get update && \
apt-get install -y curl ffmpeg wget cmake git \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY --from=build /app /app
ENTRYPOINT [ "bash", "-c" ]

View File

@ -1,19 +0,0 @@
FROM ubuntu:22.04 AS build
WORKDIR /app
RUN apt-get update && \
apt-get install -y build-essential wget cmake git \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY .. .
RUN make base.en
FROM ubuntu:22.04 AS runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY --from=build /app /app
ENTRYPOINT [ "bash", "-c" ]

View File

@ -10,13 +10,13 @@ on:
- whisper.h
jobs:
ubuntu-22:
runs-on: ubuntu-22.04
ubuntu-latest:
runs-on: ubuntu-latest
steps:
- uses: actions/setup-go@v5
- uses: actions/setup-go@v3
with:
go-version: '^1.23'
- uses: actions/checkout@v4
go-version: '^1.19'
- uses: actions/checkout@v1
- run: |
cd bindings/go
make test

View File

@ -3,63 +3,20 @@ on:
push:
paths:
- bindings/ruby/**
- src/**/*.c
- src/**/*.cpp
- src/**/*.h
- src/**/*.m
- src/**/*.metal
- include/**/*.c
- include/**/*.cpp
- include/**/*.h
- include/**/*.m
- include/**/*.metal
- ggml/**/*.c
- ggml/**/*.cpp
- ggml/**/*.h
- ggml/**/*.m
- ggml/**/*.metal
- scripts/get-flags.mk
- examples/common.h
- examples/common.cpp
- examples/common-whisper.h
- examples/common-whisper.cpp
- examples/stb_vorbis.c
- examples/miniaudio.h
- whisper.h
pull_request:
paths:
- bindings/ruby/**
- src/**/*.c
- src/**/*.cpp
- src/**/*.h
- src/**/*.m
- src/**/*.metal
- include/**/*.c
- include/**/*.cpp
- include/**/*.h
- include/**/*.m
- include/**/*.metal
- ggml/**/*.c
- ggml/**/*.cpp
- ggml/**/*.h
- ggml/**/*.m
- ggml/**/*.metal
- scripts/get-flags.mk
- examples/common.h
- examples/common.cpp
- examples/common-whisper.h
- examples/common-whisper.cpp
- examples/stb_vorbis.c
- examples/miniaudio.h
- whisper.h
jobs:
ubuntu-22:
runs-on: ubuntu-22.04
defaults:
run:
working-directory: bindings/ruby
ubuntu-latest:
runs-on: ubuntu-latest
steps:
- uses: ruby/setup-ruby@v1
with:
ruby-version: '3.1'
- uses: actions/checkout@v4
- run: rake test
ruby-version: '3.0'
- uses: actions/checkout@v1
- run: |
cd bindings/ruby/ext
ruby extconf.rb && make

File diff suppressed because it is too large Load Diff

View File

@ -1,61 +0,0 @@
name: Publish Docker image
on:
pull_request:
push:
branches:
- master
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
if: github.event.pull_request.draft == false
runs-on: ubuntu-22.04
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
matrix:
config:
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
#TODO: the cuda image keeps failing - disable for now
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: ${{ matrix.config.platform }}
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platform }}
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

View File

@ -1,91 +0,0 @@
name: Examples WASM
on:
push:
branches: ["master"]
workflow_dispatch:
permissions:
contents: read
pages: write
id-token: write
concurrency:
group: "pages"
cancel-in-progress: false
jobs:
deploy-wasm-github-pages:
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Pages
uses: actions/configure-pages@v4
- name: Setup emsdk
uses: mymindstorm/setup-emsdk@v14
- name: Build WASM Examples
# Enable for real build later in whisper.cpp
run: |
mkdir -p build-em && cd build-em
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release
make -j
- name: Create staging directory
run: mkdir -p staging
- name: Create .nojekyll file in staging directory
run: touch staging/.nojekyll
- name: Copy application files
run: |
build_dir=build-em/bin
ls ${build_dir}
# command.wasm
target_dir=staging/command.wasm
mkdir -p ${target_dir}
cp ${build_dir}/command.wasm/{index.html,command.js,helpers.js} ${target_dir}
cp ${build_dir}/libcommand.js ${target_dir}
# bench.wasm
target_dir=staging/bench.wasm
mkdir -p ${target_dir}
cp ${build_dir}/bench.wasm/{index.html,bench.js,helpers.js} ${target_dir}
cp ${build_dir}/libbench.js ${target_dir}
# stream.wasm
target_dir=staging/stream.wasm
mkdir -p ${target_dir}
cp ${build_dir}/stream.wasm/{index.html,stream.js,helpers.js} ${target_dir}
cp ${build_dir}/libstream.js ${target_dir}
# whisper.wasm (this will be the main example page)
target_dir=staging
mkdir -p ${target_dir}
cp ${build_dir}/whisper.wasm/{index.html,main.js,helpers.js} ${target_dir}
cp ${build_dir}/libmain.js ${target_dir}
# Copy Cross-Origin Isolation service worker
cp -v examples/coi-serviceworker.js staging/
- name: List files in staging directory (for debugging)
run: |
echo "Files in staging directory:"
find staging -type f | sort
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
path: ./staging
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4

View File

@ -10,8 +10,8 @@ on:
- whisper.h
jobs:
addon_node-ubuntu-22:
runs-on: ubuntu-22.04
addon_node-ubuntu-latest:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [ 16.x, 18.x ]
@ -22,7 +22,7 @@ jobs:
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential git
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
@ -37,7 +37,7 @@ jobs:
run: npm install
- name: Compile addon.node
run: npx cmake-js compile -T addon.node -B Release
run: npx cmake-js compile -T whisper-addon -B Release
- name: Download test model
run: |

41
.gitignore vendored
View File

@ -1,37 +1,23 @@
*.o
*.a
*.d
.cache/
.coreml/
.test/
.venv/
.vs/
.vscode/
.DS_Store
.vimspector.json
/CMakeSettings.json
/talk-llama.dSYM/
build/
build-*/
# SPM
.build/
.swiftpm
*.metallib
ggml-metal-embed.metal
ggml-metal-embed.metal.tmp
build-em/
build-debug/
build-release/
build-static/
build-sanitize-addr/
build-sanitize-thread/
/main
/stream
/command
/talk
/talk-llama
/bench
/quantize
/server
/lsp
arm_neon.h
sync.sh
@ -45,18 +31,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
extra/bench-gg.txt
models/*.mlmodel
models/*.mlmodelc
models/*.mlpackage
bindings/java/.gradle/
bindings/java/.idea/
.idea/
benchmark_results.csv
cmake-build-debug/
.cxx/
.gradle/
local.properties
.log
.exe

3
.gitmodules vendored
View File

@ -0,0 +1,3 @@
[submodule "bindings/ios"]
path = bindings/ios
url = https://github.com/ggerganov/whisper.spm

510
AUTHORS
View File

@ -1,510 +0,0 @@
# date: Tue Feb 4 13:03:35 EET 2025
# this file is auto-generated by scripts/gen-authors.sh
0/0 <zero@imaskeleton.me>
0cc4m <picard12@live.de>
0xsourcecode <134374803+0xsourcecode@users.noreply.github.com>
65a <10104049+65a@users.noreply.github.com>
AIWintermuteAI <32562299+AIWintermuteAI@users.noreply.github.com>
AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi>
Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Aaron Taylor <aaron@exphat.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Abitofevrything <54505189+abitofevrything@users.noreply.github.com>
Adam Jones <domdomegg+git@gmail.com>
Adrien Gallouët <adrien@gallouet.fr>
Adrien Gallouët <angt@huggingface.co>
AfryMask <AfryMask@163.com>
Ahmad Bilal <ahmad.bilal@empglabs.com>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
AidanBeltonS <aidan.belton@codeplay.com>
Akarshan Biswas <akarshan.biswas@gmail.com>
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Akash Mahajan <akash7190@gmail.com>
Akash Mahajan <akashmjn@stanford.edu>
Al Hoang <3811822-hoanga@users.noreply.gitlab.com>
Alan <unknown>
Albert Jin <albert.jin@gmail.com>
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
Aleksander Andrzejewski <18704749+aleksanderandrzejewski@users.noreply.github.com>
Alex Azarov <alex@azarov.by>
Alex Bacart <13940752+alex-bacart@users.noreply.github.com>
Alex Evgrashin <aevgrashin@yandex.ru>
Alex O'Connell <35843486+acon96@users.noreply.github.com>
Alexandr Graschenkov <alexandr.graschenkov91@gmail.com>
Alexandru Mariuti <alex@mariuti.com>
Alexey Kharlamov <alexey@kharlamov.biz>
Alfredo Montesinos <alfredo.montesinos@g.austincc.edu>
Ali Alameh <ali.alameh@isae.edu.lb>
Alter <0x7c48@gmail.com>
Ananta Bastola <anantarajbastola@gmail.com>
Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
Andreas Lubbe <git@lubbe.org>
Andreu Huguet <andreuhuguet@gmail.com>
Andrew Huynh <a5thuynh@gmail.com>
Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Andrew S <andrews54757@gmail.com>
Andy Maloney <asmaloney@gmail.com>
Anton Kostin <masguit42@users.noreply.github.com>
Artyom Mezin <psycho.fading@gmail.com>
Asad Memon <asad.lionpk@gmail.com>
Ashraful Islam <ashraful.meche@gmail.com>
AsukaMinato <asukaminato@nyan.eu.org>
AustinMroz <austinmroz@utexas.edu>
Avik Sengupta <avik@sengupta.net>
Bader-eddine Ouaich <49657842+baderouaich@users.noreply.github.com>
Baffin Lee <baffinlee@gmail.com>
Ben Ashbaugh <ben.ashbaugh@intel.com>
Ben Nortier <bjnortier@gmail.com>
Benjamin Heiniger <benjamin.heiniger@bluewin.ch>
Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
Binozo <70137898+Binozo@users.noreply.github.com>
Bo-Yi Wu <appleboy.tw@gmail.com>
Boris Bliznioukov <blib@mail.com>
Borislav Stanimirov <b.stanimirov@abv.bg>
Brad Murray <59848399+bradmurray-dt@users.noreply.github.com>
Brian Murray <brian@bmurray.ca>
CRD716 <crd716@gmail.com>
Canis Lupus <Canis-UK@users.noreply.github.com>
Carlos Zoido <mrgalleta@gmail.com>
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
CarterLi999 <664681047@qq.com>
ChangSeok Oh <shivamidow@users.noreply.github.com>
Changyeon Kim <cyzero.kim@samsung.com>
Chaoqun <27287694+OpenWaygate@users.noreply.github.com>
Charles Xu <63788048+chaxu01@users.noreply.github.com>
Charles Xu <charles.xu@arm.com>
Chen Xi <xi2.chen@intel.com>
Chen Xi <xixichen08@foxmail.com>
Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Chia-Hsiang Cheng <88014292+garychia@users.noreply.github.com>
Chidi Williams <williamschidi1@gmail.com>
Chris Elrod <elrodc@gmail.com>
Christian <12550267+iceychris@users.noreply.github.com>
Christian Kastner <ckk@kvr.at>
Clifford Heath <clifford.heath@gmail.com>
Clint Herron <hanclinto@gmail.com>
Colin <github@whoisc.cc>
Conrad Kramer <conrad@conradkramer.com>
Corey Earwood <iamcgn+github@gmail.com>
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
DAN™ <dranger003@gmail.com>
DGdev91 <DGdev91@users.noreply.github.com>
Damian Czaja <trojan295@protonmail.com>
Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
Dan Johansson <dan.johansson@arm.com>
Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Valdivia <18384552+dvaldivia@users.noreply.github.com>
Daniel Ziegenberg <daniel@ziegenberg.at>
Daniele <57776841+daniandtheweb@users.noreply.github.com>
Dave <dave-fl@users.noreply.github.com>
Dave Airlie <airlied@gmail.com>
Dave Airlie <airlied@redhat.com>
Daven Sanassy <daven@vochlea.co.uk>
David <dnhkng@gmail.com>
David Thorpe <djt@mutablelogic.com>
DavidKorczynski <david@adalogics.com>
Davidson Francis <davidsondfgl@gmail.com>
Dener Stassun <denerstassun@gmail.com>
Dibakar Gope <dibakar.gope@arm.com>
Didzis Gosko <didzis@users.noreply.github.com>
Diego Devesa <slarengh@gmail.com>
Digipom <admin@digipom.com>
Dimo <dimo@ieee.org>
Djip007 <3705339+Djip007@users.noreply.github.com>
Djip007 <djip.perois@free.fr>
Dody Suria Wijaya <dodysw@gmail.com>
Dou Xinpeng <15529241576@163.com>
Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Duncan McConnell <ddmcconnell4@gmail.com>
Egor Egorov <me@egorfine.com>
Elkana Bardugo <ttv200@gmail.com>
Emmanuel Schmidbauer <eschmidbauer@gmail.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com>
Eric Curtin <ericcurtin17@gmail.com>
Eric Swanson <eswanson@alloscomp.com>
Eric Tendian <erictendian@gmail.com>
Eric Zhang <34133756+EZForever@users.noreply.github.com>
Erik Scholz <Green-Sky@users.noreply.github.com>
Evan Jones <evan.q.jones@gmail.com>
Evan Martin <evan.martin@gmail.com>
Eve <139727413+netrunnereve@users.noreply.github.com>
Evgeny Kuznetsov <evgeny@kuznetsov.md>
F1L1P <78918286+F1L1Pv2@users.noreply.github.com>
Faisal Zaghloul <quic_fzaghlou@quicinc.com>
Fangjun Kuang <csukuangfj@gmail.com>
Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com>
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
FlippFuzz <41221030+FlippFuzz@users.noreply.github.com>
Frankie Robertson <frankier@users.noreply.github.com>
Gang Chen <goncha@gmail.com>
Gavin Cai <gavin1818@hotmail.com>
George Hindle <george@georgehindle.com>
Georgi Gerganov <ggerganov@gmail.com>
Gilad S <7817232+giladgd@users.noreply.github.com>
Gilad S <giladgd@users.noreply.github.com>
Gilad S. <7817232+giladgd@users.noreply.github.com>
GitAritron <103900385+GitAritron@users.noreply.github.com>
GiviMAD <GiviMAD@users.noreply.github.com>
Gleicon Moraes <gleicon@gmail.com>
Gregor Jasny <gjasny@googlemail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com>
HY. Kelvin Lee <34256578+hykelvinlee42@users.noreply.github.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Hang <bebound@gmail.com>
Haus1 <haus.xda@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
HimariO <dsfhe49854@gmail.com>
Hong Bo PENG <penghb@cn.ibm.com>
Hrishikesh Barman <geekodour@users.noreply.github.com>
Hugo <hugo@whynothugo.nl>
Ian Bicking <ian@ianbicking.org>
Ian Bull <irbull@eclipsesource.com>
Ihar Hrachyshka <ihrachys@redhat.com>
Ikko Ashimine <eltociear@gmail.com>
Ikko Eltociear Ashimine <eltociear@gmail.com>
InconsolableCellist <23345188+InconsolableCellist@users.noreply.github.com>
Ismatulla Mansurov <47342870+sapoepsilon@users.noreply.github.com>
Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Gorin <ivangorin21@gmail.com>
Ivo von Putzer Reibegg <ivo.putzer@gmail.com>
JJ <103335846+computerscienceiscool@users.noreply.github.com>
Jack Mousseau <jmousseau@users.noreply.github.com>
JacobLinCool <jacoblincool@gmail.com>
Jakub Ráček <blizzcz@gmail.com>
Jared Van Bortel <jared@nomic.ai>
Jay Binks <jaybinks@gmail.com>
Jayant <jayantyadav202@gmail.com>
Jeff Bolz <jbolz@nvidia.com>
Jeroen Mostert <jeroen.mostert@cm.com>
Jhen-Jie Hong <developer@jhen.me>
Jhen-Jie Hong <iainst0409@gmail.com>
JidongZhang-THU <1119708529@qq.com>
Jo Liss <joliss42@gmail.com>
Joe Todd <joe.todd@codeplay.com>
Johan <jr.raffin@gmail.com>
Johannes Gäßler <johannesg@5d6.de>
John Balis <phobossystems@gmail.com>
JohnnyB <jboero@users.noreply.github.com>
Jonathan Soo <jcsoo@agora.com>
Jonno <1160532+razodactyl@users.noreply.github.com>
Joonas Pihlajamaa <joonas.pihlajamaa@iki.fi>
Jose <34888496+Jerry-Master@users.noreply.github.com>
Josh Bleecher Snyder <josharian@gmail.com>
Josscii <jossciiweiyi@gmail.com>
Judd <foldl@users.noreply.github.com>
Jumper775 <78500318+jumpers775@users.noreply.github.com>
Jun Hee Yoo <contact.jhyoo@gmail.com>
Junil Kim <logyourself@gmail.com>
Justina Cho <justcho5@gmail.com>
Justine Tunney <jtunney@gmail.com>
Justine Tunney <jtunney@mozilla.com>
KITAITI Makoto <KitaitiMakoto@gmail.com>
KP Kaiser <kirk@zothcorp.com>
Kamilake <exjang0@gmail.com>
Karol Kontny <82021046+kkontny@users.noreply.github.com>
Karthick <j.karthic2004@gmail.com>
Kartik Saranathan <278928+Kartiku@users.noreply.github.com>
Kasumi <90275229+kasumi-1@users.noreply.github.com>
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Kendrick Taylor <kendrick@circuitsix.com>
Kevin Brothaler <admin@digipom.com>
Kevin Gibbons <bakkot@gmail.com>
Konosuke Sakai <konosuke@konosuke.work>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
Kreijstal <rainb@tfwno.gf>
Kylin <56434533+KyL0N@users.noreply.github.com>
LBlue <153975653+lbluep@users.noreply.github.com>
Larry Battle <larry.battle.tech@gmail.com>
Laytan Laats <laytanlaats@hotmail.com>
Leo Moll <leo.moll@yeasoft.com>
Lexevolution <31176843+Lexevolution@users.noreply.github.com>
LittleLoli <26589867+WhichWho@users.noreply.github.com>
Lucas Zanek <57494138+LucasZNK@users.noreply.github.com>
Luis Herrera <herrera-luis@users.noreply.github.com>
Lukas Rist <glaslos@gmail.com>
M. A. Ali <73258591+MightyStud@users.noreply.github.com>
M. Eren Akbiyik <erenakbiyik@gmail.com>
Ma Mingfei <mingfei.ma@intel.com>
Maciek <maciek.mab122@gmail.com>
Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
Marcin Mielniczuk <marmistrz.dev@zoho.eu>
Mark Karpelès <MagicalTux@users.noreply.github.com>
Mark Zhuang <zhuangqiubin@gmail.com>
Markus Tavenrath <mtavenrath@users.noreply.github.com>
Martin Delille <martin@delille.org>
Martin Warnaar <martinwarnaar@gmail.com>
Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
Matheus de Sousa <23645013+keyehzy@users.noreply.github.com>
Mathieu Baudier <mbaudier@argeo.org>
Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matija Pevec <mightymatth@users.noreply.github.com>
Matt Stephenson <mstephenson6@users.noreply.github.com>
Max Krasnyansky <max.krasnyansky@gmail.com>
Max Krasnyansky <quic_maxk@quicinc.com>
Maximiliano Levi <8160966+maxilevi@users.noreply.github.com>
Meng, Hengyu <hengyu.meng@intel.com>
Mengqing Cao <cmq0113@163.com>
Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Rienstra <mrienstra@gmail.com>
Mikhail Grigorev <sleuthhound@gmail.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohit Agarwal <mohit@sdf.org>
Molly Sophia <mollysophia379@gmail.com>
Murilo Santana <mvrilo@gmail.com>
NETZkultur GmbH <mulholland@netzkultur.de>
Natsu <chino@hotococoa.moe>
Neil Chudleigh <nchudleigh@users.noreply.github.com>
Neo Zhang <14088817+arthw@users.noreply.github.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com>
Nicholai Tukanov <nicholaitukanov@gmail.com>
Nicholas Albion <nalbion@yahoo.com>
Nico Bosshard <nico@bosshome.ch>
Nicolò Scipione <nicolo.scipione@codeplay.com>
Niels Mayer <Niels.Mayer@gmail.com>
Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Nikolaj Olsson <nikse.dk@gmail.com>
Okabintaro <103938900+Okabintaro@users.noreply.github.com>
Oleg Sidorov <me@whitebox.io>
Oleg Sidorov <oleg@sidorov.nl>
Olivier Chafik <ochafik@users.noreply.github.com>
Ondrej Kokes <ondrej.kokes@gmail.com>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
PAB <pierreantoine.bannier@gmail.com>
Paul Tsochantaris <ptsochantaris@icloud.com>
Pedro Probst <pprobst@insiberia.net>
Peng <hzp1024@qq.com>
Peter <peter277@users.noreply.github.com>
Philipp Zabel <philipp.zabel@gmail.com>
Philippe Normand <phil@base-art.net>
Philippe Normand <philn@igalia.com>
Plamen Minev <pacominev@gmail.com>
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Przemysław Pawełczyk <przemoc@gmail.com>
Qianhe Chen <54462604+chenqianhe@users.noreply.github.com>
R0CKSTAR <xiaodong.ye@mthreads.com>
R0CKSTAR <yeahdongcn@gmail.com>
Radoslav Gerganov <rgerganov@gmail.com>
Radosław Gryta <radek.gryta@gmail.com>
Rahul Vadhyar <107788610+RahulVadhyar@users.noreply.github.com>
Raiya Araki <83504221+rai62@users.noreply.github.com>
Reinforce-II <fate@eastal.com>
Reinis Muiznieks <muiznieks.reinis@gmail.com>
RelatedTitle <r3latedtitle@gmail.com>
Rémy Oudompheng <oudomphe@phare.normalesup.org>
RhinoDevel <RhinoDevel@users.noreply.github.com>
Rich Jones <miserlou@gmail.com>
Robert Ormandi <52251610+ormandi@users.noreply.github.com>
Robin <robin.xw@hotmail.com>
Roddur Dasgupta <roddurd@gmail.com>
Roland Rabien <figbug@gmail.com>
Romain Biessy <romain.biessy@codeplay.com>
Ronsor <ronsor@ronsor.pw>
Rotem Dan <rotemdan@gmail.com>
Ryan Hitchman <hitchmanr@gmail.com>
Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com>
RyanChang <ftes90015@gmail.com>
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
SXX <sxx1136965276@gmail.com>
Sacha Arbonel <sacha.arbonel@hotmail.fr>
Salman Faroz <stsfaroz@gmail.com>
Salvatore Mesoraca <s.mesoraca16@gmail.com>
Sam <49637763+Onlyartist9@users.noreply.github.com>
Sam Pullara <spullara@gmail.com>
Samuel Durante <44513615+samueldurantes@users.noreply.github.com>
Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Sandro Hanea <40202887+sandrohanea@users.noreply.github.com>
Sergio López <slp@redhat.com>
Sergio López <slp@sinrega.org>
Shanshan Shen <467638484@qq.com>
Shijie <821898965@qq.com>
Shupei Fan <dymarkfan@outlook.com>
Siddharth Ramakrishnan <srr2141@columbia.edu>
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Simon Moisselin <simon.moisstoll@gmail.com>
Sindre Sorhus <sindresorhus@gmail.com>
Slava Primenko <primenko.s@gmail.com>
Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Stavros Panakakis <53979866+Stavrospanakakis@users.noreply.github.com>
Stefan Sydow <s.sydow@heinlein-video.de>
Stefan Sydow <stefan@sydow.email>
Syahmi Azhar <prsyahmi@gmail.com>
Syed Jafri <syedjafri97@gmail.com>
Sơn Phan Trung <phantrungson17@gmail.com>
Taisei Mima <bhbstar.me@gmail.com>
Takeshi Inoue <inoue.takeshi@gmail.com>
Tamotsu Takahashi <ttakah+github@gmail.com>
Taras Glek <taras@thegp.com>
Tauseef Mohiuddin <35351464+tauseefmohammed2@users.noreply.github.com>
Thamster <Thamster@users.noreply.github.com>
Thijs Raymakers <thijs@raymakers.nl>
Thomas Fitzsimmons <fitzsim@fitzsim.org>
Tiago Fassoni <tiagofassoni@users.noreply.github.com>
Tienshiao Ma <tienshiao@tienshiao.org>
Tim Miller <drasticactions@users.noreply.github.com>
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Tobrun <tobrun.van.nuland@gmail.com>
Todd <taf2@users.noreply.github.com>
Toliver <teejae@gmail.com>
Tong Li <31761981+litongjava@users.noreply.github.com>
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
Topping1 <78745143+Topping1@users.noreply.github.com>
Travis Cline <travis.cline@gmail.com>
UEXTM.com <84163508+uextm@users.noreply.github.com>
UsernamesLame <156965854+UsernamesLame@users.noreply.github.com>
Vadim Peretokin <vperetokin@hey.com>
Valentin Gosu <1454649+valenting@users.noreply.github.com>
Vin Misra <vinith@alum.mit.edu>
Vulcan <93451215+trholding@users.noreply.github.com>
WhiteOlivierus <36532695+WhiteOlivierus@users.noreply.github.com>
William Tambellini <william.tambellini@gmail.com>
William Tambellini <wtambellini@sdl.com>
Wilson Silva <wilson.dsigns@gmail.com>
Xiang (Kevin) Li <kevinli020508@gmail.com>
Xiao-Yong Jin <jinxiaoyong@gmail.com>
XiaotaoChen <chenxiaotao1234@gmail.com>
Xingchen Song(宋星辰) <xingchensong1996@163.com>
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
Xuan Son Nguyen <thichthat@gmail.com>
Yajing Tang <phillis@google.com>
Yang Shen <aplshenyang@gmail.com>
Yunès <jean.baptiste.yunes@free.fr>
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Yusuf Redžić <48274562+redzic@users.noreply.github.com>
ZaBlazzingZephyrus <119159668+blazingzephyr@users.noreply.github.com>
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Zhiyuan Li <lizhiyuan@uniartisan.com>
Zhiyuan Li <uniartisan2017@gmail.com>
Zigfrid Zvezdin <ziggerZZ@gmail.com>
Zollner <24618122+Zolliner@users.noreply.github.com>
a3sh <38979186+A3shTnT@users.noreply.github.com>
ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
agray3 <agray3@users.noreply.github.com>
ai-at-home <149282006+ai-at-home@users.noreply.github.com>
aldorof <aldorof@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com>
amd-dwang <dong.wang@amd.com>
amritahs-ibm <amritahs@linux.vnet.ibm.com>
andypayne <apayne@gmail.com>
ardfork <134447697+ardfork@users.noreply.github.com>
arizhih <40765267+arizhih@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com>
bandoti <141645996+bandoti@users.noreply.github.com>
be-next <jerome.ramette@gmail.com>
bert hubert <bert@hubertnet.nl>
billyct <billy_allen@126.com>
bmwl <brian.marshall@tolko.com>
bobqianic <129547291+bobqianic@users.noreply.github.com>
bocytko <bocytko+github@gmail.com>
boolemancer <48014766+boolemancer@users.noreply.github.com>
boolemancer <boolemancer@gmail.com>
bradmit <151883577+bradmit@users.noreply.github.com>
brunofaustino <b.fa.amorim@gmail.com>
bssrdf <merlintiger@hotmail.com>
byte-6174 <88070277+byte-6174@users.noreply.github.com>
cdosoftei <ciprian.dosoftei@gmail.com>
clach04 <Chris.Clark@actian.com>
compilade <113953597+compilade@users.noreply.github.com>
compilade <git@compilade.net>
conradg <conradjgodfrey@gmail.com>
crummyh <elijah@crums.us>
ddpasa <112642920+ddpasa@users.noreply.github.com>
denersc <denerstassun@gmail.com>
dscripka <dscripka@users.noreply.github.com>
duthils <duthils@duthils.net>
ecneladis <ecneladis@users.noreply.github.com>
faker <nspyia2002@gmail.com>
fitzsim <fitzsim@fitzsim.org>
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
genevera (she/her) <genevera@users.noreply.github.com>
geniusnut <geniusnut@gmail.com>
gilbertgong <gilbert.gong@gmail.com>
gn64 <yukikaze.jp@gmail.com>
goldwaving <77494627+goldwaving@users.noreply.github.com>
greeshmay <greeshmay@gmail.com>
haopeng <657407891@qq.com>
hipudding <huafengchun@gmail.com>
hsinhoyeh <yhh92u@gmail.com>
hydai <z54981220@gmail.com>
iamthad <thadeus.j.fleming@gmail.com>
issixx <46835150+issixx@users.noreply.github.com>
james wolf <contractorwolf@hotmail.com>
jdomke <28772296+jdomke@users.noreply.github.com>
jettoblack <jettoblack@gmail.com>
jiez <373447296@qq.com>
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
jorismertz <35079666+jorismertz@users.noreply.github.com>
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
junkfood <69683722+JunkFood02@users.noreply.github.com>
jwijffels <jwijffels@bnosac.be>
k.h.lai <adrian.k.h.lai@outlook.com>
kamranjon <kamranjon@gmail.com>
katsu560 <katsu560oo-@docomo.ne.jp>
kennethge <57784063+kenneth-ge@users.noreply.github.com>
keyehzy <msamuel@aluno.puc-rio.br>
kunnis <kunnis@users.noreply.github.com>
l3utterfly <gc.pthzfoldr@gmail.com>
leejet <leejet714@gmail.com>
leo-pony <nengjunma@outlook.com>
lhez <quic_lih@quicinc.com>
litong <31761981+litongjava@users.noreply.github.com>
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
lnyan <lkwq007@gmail.com>
luoyu-intel <yu.luo@intel.com>
m.bell <m.bell@techsmith.com>
mahorozte <41834471+mahorozte@users.noreply.github.com>
mashizora <30516315+mashizora@users.noreply.github.com>
matt23654 <matthew.webber@protonmail.com>
matteo <matteogeniaccio@yahoo.it>
mgrachten <maarten@grachten.eu>
mkiol <mkiol@users.noreply.github.com>
mky_coder <47767389+mkycoder@users.noreply.github.com>
novag <7754358+novag@users.noreply.github.com>
pajowu <pajowu@pajowu.de>
pengxin99 <pengxin.yuan@intel.com>
petterreinholdtsen <pere-github@hungry.com>
polarmoon <90010972+polarmoon@users.noreply.github.com>
rlapray <lapray.romain@gmail.com>
sandrohanea <40202887+sandrohanea@users.noreply.github.com>
semiformal-net <84111142+semiformal-net@users.noreply.github.com>
shibukazu <61775791+shibukazu@users.noreply.github.com>
shikokuchuo <53399081+shikokuchuo@users.noreply.github.com>
slaren <slarengh@gmail.com>
slashlib <slashlib@users.noreply.github.com>
snadampal <87143774+snadampal@users.noreply.github.com>
someone13574 <81528246+someone13574@users.noreply.github.com>
st-gr <38470677+st-gr@users.noreply.github.com>
stduhpf <stephduh@live.fr>
stormofice <58337328+stormofice@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com>
thefinaldegree <thefinaldegree@gmail.com>
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
toboil-features <160222185+toboil-features@users.noreply.github.com>
trixirt <trix@redhat.com>
ulatekh <ulatekh@yahoo.com>
undef <undefdev@gmail.com>
uvos <devnull@uvos.xyz>
uvos <philipp@uvos.xyz>
valVk <valVk@users.noreply.github.com>
venkr <venkateshrameshkumar+1@gmail.com>
vicalloy <zbirder@gmail.com>
wangshuai09 <391746016@qq.com>
woachk <24752637+woachk@users.noreply.github.com>
xctan <axunlei@gmail.com>
xdrudis <xavierdrudis@yahoo.es>
yuri@FreeBSD <yuri@FreeBSD>
zhangjixiong <code.zjx@gmail.com>
zhentaoyu <zhentao.yu@intel.com>
zhouwg <6889919+zhouwg@users.noreply.github.com>
zhouwg <zhouwg2000@gmail.com>
谢乃闻 <sienaiwun@users.noreply.github.com>
布客飞龙 <562826179@qq.com>
Артём Земляк <azemlyak@smart-consulting.ru>

View File

@ -1,31 +1,21 @@
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
project("whisper.cpp" C CXX)
project("whisper.cpp" VERSION 1.7.4)
include(CheckIncludeFileCXX)
cmake_minimum_required (VERSION 3.0)
set(SOVERSION 1)
#set(CMAKE_WARN_DEPRECATED YES)
set(CMAKE_WARN_UNUSED_CLI YES)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
project(whisper.cpp VERSION 1.2.0)
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(WHISPER_STANDALONE ON)
include(git-vars)
include(GitVars)
include(BuildTypes)
# configure project version
if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
endif()
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
else()
set(WHISPER_STANDALONE OFF)
@ -35,16 +25,6 @@ if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
# TODO: without these, we get the following error:
# wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s TOTAL_STACK=5242880")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s TOTAL_STACK=5242880")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated")
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -53,144 +33,220 @@ else()
endif()
endif()
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
# options
#
# option list
#
option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
# general
option(WHISPER_CCACHE "whisper: use ccache if available" ON)
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
# debug
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
# build
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
option(WHISPER_USE_SYSTEM_GGML "whisper: use system-installed GGML library" OFF)
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
else()
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
endif()
option(WHISPER_PERF "whisper: enable perf timings" OFF)
# sanitizers
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
# extra artifacts
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_SERVER "whisper: build server example" ${WHISPER_STANDALONE})
if (NOT MSVC)
if (WHISPER_SANITIZE_THREAD)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
endif()
# 3rd party libs
option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
if (WHISPER_SANITIZE_ADDRESS)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
endif()
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
if (WHISPER_SANITIZE_UNDEFINED)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
endif()
endif()
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
# dependencies
# override ggml options
set(GGML_CCACHE ${WHISPER_CCACHE})
set(GGML_SANITIZE_THREAD ${WHISPER_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${WHISPER_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${WHISPER_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${WHISPER_FATAL_WARNINGS})
find_package(Threads REQUIRED)
# transition helpers
function (whisper_option_depr TYPE OLD NEW)
if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON)
endif()
endfunction()
# on APPLE - include Accelerate framework
if (APPLE AND NOT WHISPER_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
whisper_option_depr(FATAL_ERROR WHISPER_CUBLAS GGML_CUDA)
whisper_option_depr(WARNING WHISPER_CUDA GGML_CUDA)
whisper_option_depr(WARNING WHISPER_KOMPUTE GGML_KOMPUTE)
whisper_option_depr(WARNING WHISPER_METAL GGML_METAL)
whisper_option_depr(WARNING WHISPER_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
whisper_option_depr(WARNING WHISPER_NATIVE GGML_NATIVE)
whisper_option_depr(WARNING WHISPER_OPENMP GGML_OPENMP)
whisper_option_depr(WARNING WHISPER_RPC GGML_RPC)
whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
#
# build the library
#
if (NOT TARGET ggml)
if (WHISPER_USE_SYSTEM_GGML)
find_package(ggml REQUIRED)
if (NOT ggml_FOUND)
message(FATAL_ERROR "System-installed GGML library not found.")
endif()
add_library(ggml ALIAS ggml::ggml)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
else()
add_subdirectory(ggml)
message(WARNING "Accelerate framework not found")
endif()
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
add_subdirectory(src)
if (WHISPER_SUPPORT_OPENBLAS)
find_library(OPENBLAS_LIB
NAMES openblas libopenblas
)
if (OPENBLAS_LIB)
message(STATUS "OpenBLAS found")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${OPENBLAS_LIB})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
else()
message(WARNING "OpenBLAS not found")
endif()
endif()
# compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
endif ()
if (WHISPER_ALL_WARNINGS)
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-Wall \
-Wextra \
-Wpedantic \
-Wshadow \
-Wcast-qual \
-Wstrict-prototypes \
-Wpointer-arith \
-Wno-unused-function \
")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-Wall \
-Wextra \
-Wpedantic \
-Wcast-qual \
")
else()
# todo : msvc
endif()
endif()
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
endif()
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
else()
message(STATUS "x86 detected")
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
else()
if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
if(NOT WHISPER_NO_AVX2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
if(NOT WHISPER_NO_FMA)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
endif()
endif()
if (WHISPER_PERF)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
endif()
#
# install
# whisper - this is the main library of the project
#
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(TARGET whisper)
set(WHISPER_BUILD_NUMBER ${BUILD_NUMBER})
set(WHISPER_BUILD_COMMIT ${BUILD_COMMIT})
set(WHISPER_INSTALL_VERSION ${CMAKE_PROJECT_VERSION})
add_library(${TARGET}
ggml.h
ggml.c
whisper.h
whisper.cpp
)
set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(WHISPER_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(WHISPER_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
include(DefaultTargetOptions)
get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
target_include_directories(${TARGET} PUBLIC
.
)
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
if (MSVC)
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper
PATH_VARS
WHISPER_INCLUDE_INSTALL_DIR
WHISPER_LIB_INSTALL_DIR
WHISPER_BIN_INSTALL_DIR )
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
else()
target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif()
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
VERSION ${WHISPER_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
if (BUILD_SHARED_LIBS)
target_link_libraries(${TARGET} PUBLIC
${CMAKE_DL_LIBS}
)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
target_compile_definitions(${TARGET} PUBLIC
WHISPER_SHARED
)
endif()
configure_file(cmake/whisper.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
@ONLY)
if (EMSCRIPTEN)
set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
DESTINATION lib/pkgconfig)
target_compile_definitions(${TARGET} PUBLIC
${WHISPER_EXTRA_FLAGS}
)
set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
install(TARGETS ${TARGET}
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
PUBLIC_HEADER DESTINATION include
)
#
# bindings
#
add_subdirectory(bindings)
#
# programs, examples and tests
#
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
enable_testing()
add_subdirectory(tests)
endif ()

View File

@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2022 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

251
Makefile
View File

@ -1,12 +1,225 @@
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif
ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
ifeq ($(SYSCTL_M),1)
# UNAME_P := arm
# UNAME_M := arm64
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
endif
endif
endif
#
# Compile flags
#
CFLAGS = -I. -O3 -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
LDFLAGS =
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifeq ($(UNAME_S),Darwin)
CFLAGS += -mf16c
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring FMA,$(AVX1_M)))
CFLAGS += -mfma
endif
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
else ifeq ($(UNAME_S),Linux)
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell grep "fma " /proc/cpuinfo)
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
else ifeq ($(UNAME_S),Haiku)
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell sysinfo -cpu | grep "FMA ")
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell sysinfo -cpu | grep "F16C ")
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
else
CFLAGS += -mfma -mf16c -mavx -mavx2
endif
endif
ifeq ($(UNAME_M),amd64)
CFLAGS += -mavx -mavx2 -mfma -mf16c
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
CFLAGS += -mpower9-vector
endif
# Require c++23's std::byteswap for big-endian support.
ifeq ($(UNAME_M),ppc64)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif
ifndef WHISPER_NO_ACCELERATE
# Mac M1 - include Accelerate framework
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
ifdef WHISPER_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
#
# Print build information
#
$(info I whisper.cpp build info: )
$(info I UNAME_S: $(UNAME_S))
$(info I UNAME_P: $(UNAME_P))
$(info I UNAME_M: $(UNAME_M))
$(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main
#
# Build library
#
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
whisper.o: whisper.cpp whisper.h
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
libwhisper.a: ggml.o whisper.o
$(AR) rcs libwhisper.a ggml.o whisper.o
libwhisper.so: ggml.o whisper.o
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
clean:
rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
#
# Examples
#
CC_SDL=`sdl2-config --cflags --libs`
SRC_COMMON = examples/common.cpp
SRC_COMMON_SDL = examples/common-sdl.cpp
main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
./main -h
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
#
# Audio samples
#
.PHONY: build
build:
cmake -B build
cmake --build build --config Release
# download a few audio samples into folder "./samples":
.PHONY: samples
samples:
@ -16,8 +229,12 @@ samples:
@wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
@echo "Converting to 16-bit WAV ..."
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
@rm samples/mm1.wav
#
# Models
@ -35,24 +252,28 @@ samples:
.PHONY: medium.en
.PHONY: medium
.PHONY: large-v1
.PHONY: large-v2
.PHONY: large-v3
.PHONY: large-v3-turbo
.PHONY: large
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo:
tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
bash ./models/download-ggml-model.sh $@
cmake -B build
cmake --build build --config Release
@echo ""
@echo "==============================================="
@echo "Running $@ on all samples in ./samples ..."
@echo "==============================================="
@echo ""
@for f in samples/*$(.flac .mp3 .ogg .wav); do \
@for f in samples/*.wav; do \
echo "----------------------------------------------" ; \
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
echo "----------------------------------------------" ; \
echo "" ; \
./build/bin/whisper-cli -m models/ggml-$@.bin -f $$f ; \
./main -m models/ggml-$@.bin -f $$f ; \
echo "" ; \
done
#
# Tests
#
.PHONY: tests
tests:
bash ./tests/run-tests.sh

735
README.md
View File

@ -1,47 +1,37 @@
# whisper.cpp
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
[![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
> [!NOTE]
> New maintenance roadmap: https://github.com/ggerganov/whisper.cpp/discussions/2788
Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
- Plain C/C++ implementation without dependencies
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support)
- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
- AVX intrinsics support for x86 architectures
- [VSX intrinsics support for POWER architectures](#power-vsx-intrinsics)
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [Integer quantization support](#quantization)
- Low memory usage (Flash Attention)
- Zero memory allocations at runtime
- [Vulkan support](#vulkan-gpu-support)
- Support for CPU-only inference
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
- [OpenVINO Support](#openvino-support)
- [Ascend NPU Support](#ascend-npu-support)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
- Runs on the CPU
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
Supported platforms:
- [x] Mac OS (Intel and Arm)
- [x] [iOS](examples/whisper.objc)
- [x] [Android](examples/whisper.android)
- [x] [Java](bindings/java/README.md)
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
- [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
The entire implementation of the model is contained in 2 source files:
- Tensor operations: [ggml.h](ggml.h) / [ggml.c](ggml.c)
- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)
Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
@ -52,53 +42,149 @@ You can also easily make your own offline voice assistant application: [command]
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
On Apple Silicon, the inference runs fully on the GPU via Metal:
Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
## Implementation details
- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
- Sample usage is demonstrated in [main.cpp](examples/main)
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
- Various other examples are available in the [examples](examples) folder
The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
## Quick start
First clone the repository:
First, download one of the Whisper models converted in [ggml format](models). For example:
```bash
git clone https://github.com/ggerganov/whisper.cpp.git
bash ./models/download-ggml-model.sh base.en
```
Navigate into the directory:
```
cd whisper.cpp
```
Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
Now build the [main](examples/main) example and transcribe an audio file like this:
```bash
sh ./models/download-ggml-model.sh base.en
```
Now build the [whisper-cli](examples/cli) example and transcribe an audio file like this:
```bash
# build the project
cmake -B build
cmake --build build --config Release
# build the main example
make
# transcribe an audio file
./build/bin/whisper-cli -f samples/jfk.wav
./main -f samples/jfk.wav
```
---
For a quick demo, simply run `make base.en`.
For a quick demo, simply run `make base.en`:
```java
$ make base.en
cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o ggml.o -o main -framework Accelerate
./main -h
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-bo N, --best-of N [5 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ...
ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
You can now use it like this:
$ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
===============================================
Running base.en on all samples in ./samples ...
===============================================
----------------------------------------------
[+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
----------------------------------------------
whisper_init_from_file: loading model from 'models/ggml-base.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 512
whisper_model_load: n_text_head = 8
whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 2
whisper_model_load: mem required = 215.00 MB (+ 6.00 MB per decoder)
whisper_model_load: kv self size = 5.25 MB
whisper_model_load: kv cross size = 17.58 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: model ctx = 140.60 MB
whisper_model_load: model size = 140.54 MB
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
[00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
whisper_print_timings: fallbacks = 0 p / 0 h
whisper_print_timings: load time = 113.81 ms
whisper_print_timings: mel time = 15.40 ms
whisper_print_timings: sample time = 11.58 ms / 27 runs ( 0.43 ms per run)
whisper_print_timings: encode time = 266.60 ms / 1 runs ( 266.60 ms per run)
whisper_print_timings: decode time = 66.11 ms / 27 runs ( 2.45 ms per run)
whisper_print_timings: total time = 476.31 ms
```
The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
For detailed usage instructions, run: `./build/bin/whisper-cli -h`
For detailed usage instructions, run: `./main -h`
Note that the [whisper-cli](examples/cli) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
For example, you can use `ffmpeg` like this:
```bash
```java
ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
```
@ -107,7 +193,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
If you want some extra audio samples to play with, simply run:
```
make -j samples
make samples
```
This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.
@ -115,325 +201,124 @@ This will download a few more audio files from Wikipedia and convert them to 16-
You can download and run the other models as follows:
```
make -j tiny.en
make -j tiny
make -j base.en
make -j base
make -j small.en
make -j small
make -j medium.en
make -j medium
make -j large-v1
make -j large-v2
make -j large-v3
make -j large-v3-turbo
make tiny.en
make tiny
make base.en
make base
make small.en
make small
make medium.en
make medium
make large-v1
make large
```
## Memory usage
| Model | Disk | Mem |
| ------ | ------- | ------- |
| tiny | 75 MiB | ~273 MB |
| base | 142 MiB | ~388 MB |
| small | 466 MiB | ~852 MB |
| medium | 1.5 GiB | ~2.1 GB |
| large | 2.9 GiB | ~3.9 GB |
## POWER VSX Intrinsics
`whisper.cpp` supports POWER architectures and includes code which
significantly speeds operation on Linux running on POWER9/10, making it
capable of faster-than-realtime transcription on underclocked Raptor
Talos II. Ensure you have a BLAS package installed, and replace the
standard cmake setup with:
```bash
# build with GGML_BLAS defined
cmake -B build -DGGML_BLAS=1
cmake --build build --config Release
./build/bin/whisper-cli [ .. etc .. ]
## Quantization
`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
Quantized models require less memory and disk space and depending on the hardware can be processed more efficiently.
Here are the steps for creating and using a quantized model:
```bash
# quantize a model with Q5_0 method
cmake -B build
cmake --build build --config Release
./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
# run the examples as usual, specifying the quantized model file
./build/bin/whisper-cli -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
```
## Core ML support
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
speed-up - more than x3 faster compared with CPU-only execution. Here are the instructions for generating a Core ML model and using it with `whisper.cpp`:
- Install Python dependencies needed for the creation of the Core ML model:
```bash
pip install ane_transformers
pip install openai-whisper
pip install coremltools
```
- To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
- Python 3.11 is recommended.
- MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
- [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
- To create an environment, use: `conda create -n py311-whisper python=3.11 -y`
- To activate the environment, use: `conda activate py311-whisper`
- Generate a Core ML model. For example, to generate a `base.en` model, use:
```bash
./models/generate-coreml-model.sh base.en
```
This will generate the folder `models/ggml-base.en-encoder.mlmodelc`
- Build `whisper.cpp` with Core ML support:
```bash
# using CMake
cmake -B build -DWHISPER_COREML=1
cmake --build build -j --config Release
```
- Run the examples as usual. For example:
```text
$ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
...
whisper_init_state: loading Core ML model from 'models/ggml-base.en-encoder.mlmodelc'
whisper_init_state: first run on a device may take a while ...
whisper_init_state: Core ML model loaded
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
...
```
The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
Next runs are faster.
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
## OpenVINO support
On platforms that support [OpenVINO](https://github.com/openvinotoolkit/openvino), the Encoder inference can be executed
on OpenVINO-supported devices including x86 CPUs and Intel GPUs (integrated & discrete).
This can result in significant speedup in encoder performance. Here are the instructions for generating the OpenVINO model and using it with `whisper.cpp`:
- First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.
Windows:
```powershell
cd models
python -m venv openvino_conv_env
openvino_conv_env\Scripts\activate
python -m pip install --upgrade pip
pip install -r requirements-openvino.txt
```
Linux and macOS:
```bash
cd models
python3 -m venv openvino_conv_env
source openvino_conv_env/bin/activate
python -m pip install --upgrade pip
pip install -r requirements-openvino.txt
```
- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
```
python convert-whisper-to-openvino.py --model base.en
```
This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as `ggml` models, as that
is the default location that the OpenVINO extension will search at runtime.
- Build `whisper.cpp` with OpenVINO support:
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
Linux:
```bash
source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
```
Windows (cmd):
```powershell
C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
```
And then build the project using cmake:
```bash
cmake -B build -DWHISPER_OPENVINO=1
cmake --build build -j --config Release
```
- Run the examples as usual. For example:
```text
$ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
...
whisper_ctx_init_openvino_encoder: loading OpenVINO model from 'models/ggml-base.en-encoder-openvino.xml'
whisper_ctx_init_openvino_encoder: first run on a device may take a while ...
whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = GPU, cache_dir = models/ggml-base.en-encoder-openvino-cache
whisper_ctx_init_openvino_encoder: OpenVINO model loaded
system_info: n_threads = 4 / 8 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 1 |
...
```
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
cached for the next run.
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
## NVIDIA GPU support
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
Now build `whisper.cpp` with CUDA support:
```
cmake -B build -DGGML_CUDA=1
cmake --build build -j --config Release
```
## Vulkan GPU support
Cross-vendor solution which allows you to accelerate workload on your GPU.
First, make sure your graphics card driver provides support for Vulkan API.
Now build `whisper.cpp` with Vulkan support:
```
cmake -B build -DGGML_VULKAN=1
cmake --build build -j --config Release
```
## BLAS CPU support via OpenBLAS
Encoder processing can be accelerated on the CPU via OpenBLAS.
First, make sure you have installed `openblas`: https://www.openblas.net/
Now build `whisper.cpp` with OpenBLAS support:
```
cmake -B build -DGGML_BLAS=1
cmake --build build -j --config Release
```
## Ascend NPU support
Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
First, check if your Ascend NPU device is supported:
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community) . The lasted version of CANN is recommanded.
Now build `whisper.cpp` with CANN support:
```
cmake -B build -DGGML_CANN=1
cmake --build build -j --config Release
```
Run the inference examples as usual, for example:
```
./build/bin/whisper-cli -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
```
*Notes:*
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
## Docker
### Prerequisites
- Docker must be installed and running on your system.
- Create a folder to store big models & intermediate files (ex. /whisper/models)
### Images
We have two Docker images available for this project:
1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
### Usage
```shell
# download model and persist it in a local folder
docker run -it --rm \
-v path/to/models:/models \
whisper.cpp:main "./models/download-ggml-model.sh base /models"
# transcribe an audio file
docker run -it --rm \
-v path/to/models:/models \
-v path/to/audios:/audios \
whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
# transcribe an audio file in samples folder
docker run -it --rm \
-v path/to/models:/models \
whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
```
## Installing with Conan
You can install pre-built binaries for whisper.cpp or build it from source using [Conan](https://conan.io/). Use the following command:
```
conan install --requires="whisper-cpp/[*]" --build=missing
```
For detailed instructions on how to use Conan, please refer to the [Conan documentation](https://docs.conan.io/2/).
| Model | Disk | Mem | SHA |
| --- | --- | --- | --- |
| tiny | 75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Limitations
- Inference only
- No GPU support (yet)
## Another example
Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
in about half a minute on a MacBook M1 Pro, using `medium.en` model:
<details>
<summary>Expand to see the result</summary>
```java
$ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 1024
whisper_model_load: n_audio_head = 16
whisper_model_load: n_audio_layer = 24
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 1024
whisper_model_load: n_text_head = 16
whisper_model_load: n_text_layer = 24
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 4
whisper_model_load: mem required = 1720.00 MB (+ 43.00 MB per decoder)
whisper_model_load: kv self size = 42.00 MB
whisper_model_load: kv cross size = 140.62 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: model ctx = 1462.35 MB
whisper_model_load: model size = 1462.12 MB
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
[00:00:00.000 --> 00:00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
[00:00:08.000 --> 00:00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
[00:00:17.000 --> 00:00:23.000] A short time later, debris was seen falling from the skies above Texas.
[00:00:23.000 --> 00:00:29.000] The Columbia's lost. There are no survivors.
[00:00:29.000 --> 00:00:32.000] On board was a crew of seven.
[00:00:32.000 --> 00:00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
[00:00:39.000 --> 00:00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
[00:00:48.000 --> 00:00:52.000] a colonel in the Israeli Air Force.
[00:00:52.000 --> 00:00:58.000] These men and women assumed great risk in the service to all humanity.
[00:00:58.000 --> 00:01:03.000] In an age when space flight has come to seem almost routine,
[00:01:03.000 --> 00:01:07.000] it is easy to overlook the dangers of travel by rocket
[00:01:07.000 --> 00:01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
[00:01:12.000 --> 00:01:18.000] These astronauts knew the dangers, and they faced them willingly,
[00:01:18.000 --> 00:01:23.000] knowing they had a high and noble purpose in life.
[00:01:23.000 --> 00:01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
[00:01:31.000 --> 00:01:36.000] All Americans today are thinking as well of the families of these men and women
[00:01:36.000 --> 00:01:40.000] who have been given this sudden shock and grief.
[00:01:40.000 --> 00:01:45.000] You're not alone. Our entire nation grieves with you,
[00:01:45.000 --> 00:01:52.000] and those you love will always have the respect and gratitude of this country.
[00:01:52.000 --> 00:01:56.000] The cause in which they died will continue.
[00:01:56.000 --> 00:02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
[00:02:04.000 --> 00:02:11.000] and the longing to understand. Our journey into space will go on.
[00:02:11.000 --> 00:02:16.000] In the skies today, we saw destruction and tragedy.
[00:02:16.000 --> 00:02:22.000] Yet farther than we can see, there is comfort and hope.
[00:02:22.000 --> 00:02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
[00:02:29.000 --> 00:02:35.000] who created all these. He who brings out the starry hosts one by one
[00:02:35.000 --> 00:02:39.000] and calls them each by name."
[00:02:39.000 --> 00:02:46.000] Because of His great power and mighty strength, not one of them is missing.
[00:02:46.000 --> 00:02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
[00:02:55.000 --> 00:03:01.000] The crew of the shuttle Columbia did not return safely to earth,
[00:03:01.000 --> 00:03:05.000] yet we can pray that all are safely home.
[00:03:05.000 --> 00:03:13.000] May God bless the grieving families, and may God continue to bless America.
[00:03:13.000 --> 00:03:19.000] [Silence]
whisper_print_timings: fallbacks = 1 p / 0 h
whisper_print_timings: load time = 569.03 ms
whisper_print_timings: mel time = 146.85 ms
whisper_print_timings: sample time = 238.66 ms / 553 runs ( 0.43 ms per run)
whisper_print_timings: encode time = 18665.10 ms / 9 runs ( 2073.90 ms per run)
whisper_print_timings: decode time = 13090.93 ms / 549 runs ( 23.85 ms per run)
whisper_print_timings: total time = 32733.52 ms
```
</details>
## Real-time audio input example
This is a naive example of performing real-time inference on audio from your microphone.
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly.
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```bash
cmake -B build -DWHISPER_SDL2=ON
cmake --build build --config Release
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```java
make stream
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
@ -443,18 +328,14 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
to highlight words with high or low confidence:
```bash
./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
```
<img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
## Controlling the length of the generated text segments (experimental)
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
```text
$ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
whisper_model_load: loading model from './models/ggml-base.en.bin'
...
@ -473,12 +354,12 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
[00:00:10.020 --> 00:00:11.000] country.
```
## Word-level timestamp (experimental)
## Word-level timestamp
The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:
```text
$ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
whisper_model_load: loading model from './models/ggml-base.en.bin'
...
@ -486,7 +367,7 @@ system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
[00:00:00.000 --> 00:00:00.320]
[00:00:00.000 --> 00:00:00.320]
[00:00:00.320 --> 00:00:00.370] And
[00:00:00.370 --> 00:00:00.690] so
[00:00:00.690 --> 00:00:00.850] my
@ -514,42 +395,16 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
[00:00:10.510 --> 00:00:11.000] .
```
## Speaker segmentation via tinydiarize (experimental)
More information about this approach is available here: https://github.com/ggerganov/whisper.cpp/pull/1058
Sample usage:
```py
# download a tinydiarize compatible model
./models/download-ggml-model.sh small.en-tdrz
# run as usual, adding the "-tdrz" command-line argument
./build/bin/whisper-cli -f ./samples/a13.wav -m ./models/ggml-small.en-tdrz.bin -tdrz
...
main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, tdrz = 1, timestamps = 1 ...
...
[00:00:00.000 --> 00:00:03.800] Okay Houston, we've had a problem here. [SPEAKER_TURN]
[00:00:03.800 --> 00:00:06.200] This is Houston. Say again please. [SPEAKER_TURN]
[00:00:06.200 --> 00:00:08.260] Uh Houston we've had a problem.
[00:00:08.260 --> 00:00:11.320] We've had a main beam up on a volt. [SPEAKER_TURN]
[00:00:11.320 --> 00:00:13.820] Roger main beam interval. [SPEAKER_TURN]
[00:00:13.820 --> 00:00:15.100] Uh uh [SPEAKER_TURN]
[00:00:15.100 --> 00:00:18.020] So okay stand, by thirteen we're looking at it. [SPEAKER_TURN]
[00:00:18.020 --> 00:00:25.740] Okay uh right now uh Houston the uh voltage is uh is looking good um.
[00:00:27.620 --> 00:00:29.940] And we had a a pretty large bank or so.
```
## Karaoke-style movie generation (experimental)
The [whisper-cli](examples/cli) example provides support for output of karaoke-style movies, where the
The [main](examples/main) example provides support for output of karaoke-style movies, where the
currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
This requires to have `ffmpeg` installed.
Here are a few _"typical"_ examples:
Here are a few *"typical"* examples:
```bash
./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
source ./samples/jfk.wav.wts
ffplay ./samples/jfk.wav.mp4
```
@ -558,8 +413,8 @@ https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b
---
```bash
./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
```java
./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
source ./samples/mm0.wav.wts
ffplay ./samples/mm0.wav.mp4
```
@ -568,8 +423,8 @@ https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-9
---
```bash
./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
```java
./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
source ./samples/gb0.wav.wts
ffplay ./samples/gb0.wav.mp4
```
@ -578,40 +433,15 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
---
## Video comparison of different models
Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
```bash
./scripts/bench-wts.sh samples/jfk.wav
ffplay ./samples/jfk.wav.all.mp4
```
https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
---
## Benchmarks
In order to have an objective comparison of the performance of the inference across different system configurations,
use the [whisper-bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
use the [bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
took to execute it. The results are summarized in the following Github issue:
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
You can run it with the following command, by default it will run against any standard model in the models folder.
```bash
python3 scripts/bench.py -f samples/jfk.wav -t 2,4,8 -p 1,2
```
It is written in python with the intention of being easy to modify and extend for your benchmarking use case.
It outputs a csv file with the results of the benchmarking.
## `ggml` format
## ggml format
The original models are converted to a custom binary format. This allows to pack everything needed into a single file:
@ -623,54 +453,43 @@ The original models are converted to a custom binary format. This allows to pack
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
or manually from here:
- https://huggingface.co/ggerganov/whisper.cpp
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
in [models](models).
## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [x] Java:
- [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
- [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)
## Examples
There are various examples of using the library for different projects in the [examples](examples) folder.
Some of the examples are even ported to run in the browser using WebAssembly. Check them out!
| Example | Web | Description |
| --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
| [whisper-cli](examples/cli) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
| [whisper-bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
| [whisper-stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [whisper-command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [whisper-server](examples/server) | | HTTP transcription server with OAI-like API |
| [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
| [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
| Example | Web | Description |
| --- | --- | --- |
| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
| [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)

View File

@ -1,249 +0,0 @@
# whisper.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators<72>such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel<EFBFBD> DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
The whisper.cpp for SYCL is used to support Intel GPUs.
For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install Intel GPU driver.
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPU, please install the client GPU driver.
b. Add user to group: video, render.
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: re-login to enable it.
c. Check
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel<65> oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel<65> oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
b. Check
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
2. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
#for FP32
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### Run
1. Put model file to folder **models**
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List device ID
Run without parameter:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Set device ID and execute whisper.cpp
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
```
or run by script:
```
./examples/sycl/run_whisper.sh
```
5. Check the device ID in output
Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
Solution: add **--no-mmap**.
## Todo
- Support to build in Windows.
- Support multiple cards.

View File

@ -1,47 +1,18 @@
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif
ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
GGML_METAL_PATH_RESOURCES := $(abspath ../..)
BUILD_DIR := build_go
BUILD_DIR := build
MODELS_DIR := models
EXAMPLES_DIR := $(wildcard examples/*)
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src:$(abspath ../../${BUILD_DIR}/ggml/src))
ifeq ($(GGML_CUDA),1)
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
endif
ifeq ($(UNAME_S),Darwin)
EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
endif
INCLUDE_PATH := $(abspath ../..)
LIBRARY_PATH := $(abspath ../..)
all: clean whisper examples
whisper: mkdir
cmake -S ../.. -B ../../${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF
cmake --build ../../${BUILD_DIR} --target whisper
@echo Build whisper
@${MAKE} -C ../.. libwhisper.a
test: model-small whisper modtidy
ifeq ($(UNAME_S),Darwin)
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v .
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v ./pkg/whisper/...
else
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...
endif
examples: $(EXAMPLES_DIR)
@ -50,11 +21,7 @@ model-small: mkdir examples/go-model-download
$(EXAMPLES_DIR): mkdir whisper modtidy
@echo Build example $(notdir $@)
ifeq ($(UNAME_S),Darwin)
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go build ${BUILD_FLAGS} -ldflags "-extldflags '$(EXT_LDFLAGS)'" -o ${BUILD_DIR}/$(notdir $@) ./$@
else
@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@
endif
mkdir:
@echo Mkdir ${BUILD_DIR}
@ -65,7 +32,7 @@ mkdir:
modtidy:
@go mod tidy
clean:
clean:
@echo Clean
@rm -fr $(BUILD_DIR)
@go clean

View File

@ -31,7 +31,7 @@ func main() {
if err != nil {
panic(err)
}
if err := context.Process(samples, nil, nil, nil); err != nil {
if err := context.Process(samples, nil); err != nil {
return err
}
@ -62,12 +62,6 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model
make examples
```
To build using cuda support add `GGML_CUDA=1`:
```bash
GGML_CUDA=1 make examples
```
The examples are placed in the `build` directory. Once built, you can download all the models with the following command:
```bash
@ -77,7 +71,7 @@ The examples are placed in the `build` directory. Once built, you can download a
And you can then test a model against samples with the following command:
```bash
./build/go-whisper -model models/ggml-tiny.en.bin samples/jfk.wav
./build/go-whisper -model models/ggml-tiny.en.bin samples/jfk.wav
```
## Using the bindings

View File

@ -9,23 +9,22 @@ import (
// ContextForSignal returns a context object which is cancelled when a signal
// is received. It returns nil if no signal parameter is provided
func ContextForSignal(signals ...os.Signal) context.Context {
if len(signals) == 0 {
return nil
}
if len(signals) == 0 {
return nil
}
ch := make(chan os.Signal, 1) // Buffered channel with space for 1 signal
ctx, cancel := context.WithCancel(context.Background())
ch := make(chan os.Signal)
ctx, cancel := context.WithCancel(context.Background())
// Send message on channel when signal received
signal.Notify(ch, signals...)
// Send message on channel when signal received
signal.Notify(ch, signals...)
// When any signal is received, call cancel
go func() {
<-ch
cancel()
}()
// When any signal received, call cancel
go func() {
<-ch
cancel()
}()
// Return success
return ctx
// Return success
return ctx
}

View File

@ -9,7 +9,6 @@ import (
"net/url"
"os"
"path/filepath"
"strings"
"syscall"
"time"
)
@ -18,27 +17,14 @@ import (
// CONSTANTS
const (
srcUrl = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/" // The location of the models
srcExt = ".bin" // Filename extension
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
srcUrl = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
srcExt = ".bin" // Filename extension
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
)
var (
// The models which will be downloaded, if no model is specified as an argument
modelNames = []string{
"tiny", "tiny-q5_1", "tiny-q8_0",
"tiny.en", "tiny.en-q5_1", "tiny.en-q8_0",
"base", "base-q5_1", "base-q8_0",
"base.en", "base.en-q5_1", "base.en-q8_0",
"small", "small-q5_1", "small-q8_0",
"small.en", "small.en-q5_1", "small.en-q8_0",
"medium", "medium-q5_0", "medium-q8_0",
"medium.en", "medium.en-q5_0", "medium.en-q8_0",
"large-v1",
"large-v2", "large-v2-q5_0", "large-v2-q8_0",
"large-v3", "large-v3-q5_0",
"large-v3-turbo", "large-v3-turbo-q5_0", "large-v3-turbo-q8_0",
}
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
)
var (
@ -58,25 +44,7 @@ var (
func main() {
flag.Usage = func() {
name := filepath.Base(flag.CommandLine.Name())
fmt.Fprintf(flag.CommandLine.Output(), `
Usage: %s [options] [<model>...]
Options:
-out string Specify the output folder where models will be saved.
Default: Current working directory.
-timeout duration Set the maximum duration for downloading a model.
Example: 10m, 1h (default: 30m0s).
-quiet Suppress all output except errors.
Examples:
1. Download a specific model:
%s -out ./models tiny-q8_0
2. Download all models:
%s -out ./models
`, name, name, name)
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [options] <model>\n\n", name)
flag.PrintDefaults()
}
flag.Parse()
@ -146,87 +114,23 @@ func GetOut() (string, error) {
// GetModels returns the list of models to download
func GetModels() []string {
if flag.NArg() == 0 {
fmt.Println("No model specified.")
fmt.Println("Preparing to download all models...")
// Calculate total download size
fmt.Println("Calculating total download size...")
totalSize, err := CalculateTotalDownloadSize(modelNames)
if err != nil {
fmt.Println("Error calculating download sizes:", err)
os.Exit(1)
}
fmt.Println("View available models: https://huggingface.co/ggerganov/whisper.cpp/tree/main")
fmt.Printf("Total download size: %.2f GB\n", float64(totalSize)/(1024*1024*1024))
fmt.Println("Would you like to download all models? (y/N)")
// Prompt for user input
var response string
fmt.Scanln(&response)
if response != "y" && response != "Y" {
fmt.Println("Aborting. Specify a model to download.")
os.Exit(0)
}
return modelNames // Return all models if confirmed
return modelNames
} else {
return flag.Args()
}
return flag.Args() // Return specific models if arguments are provided
}
func CalculateTotalDownloadSize(models []string) (int64, error) {
var totalSize int64
client := http.Client{}
for _, model := range models {
modelURL, err := URLForModel(model)
if err != nil {
return 0, err
}
// Issue a HEAD request to get the file size
req, err := http.NewRequest("HEAD", modelURL, nil)
if err != nil {
return 0, err
}
resp, err := client.Do(req)
if err != nil {
return 0, err
}
resp.Body.Close()
if resp.StatusCode != http.StatusOK {
fmt.Printf("Warning: Unable to fetch size for %s (HTTP %d)\n", model, resp.StatusCode)
continue
}
size := resp.ContentLength
totalSize += size
}
return totalSize, nil
}
// URLForModel returns the URL for the given model on huggingface.co
func URLForModel(model string) (string, error) {
// Ensure "ggml-" prefix is added only once
if !strings.HasPrefix(model, "ggml-") {
model = "ggml-" + model
}
// Ensure ".bin" extension is added only once
if filepath.Ext(model) != srcExt {
model += srcExt
}
// Parse the base URL
url, err := url.Parse(srcUrl)
if err != nil {
return "", err
} else {
url.Path = filepath.Join(url.Path, model)
}
// Ensure no trailing slash in the base URL
url.Path = fmt.Sprintf("%s/%s", strings.TrimSuffix(url.Path, "/"), model)
return url.String(), nil
}

View File

@ -68,6 +68,10 @@ func (flags *Flags) GetOut() string {
return strings.ToLower(flags.Lookup("out").Value.String())
}
func (flags *Flags) IsSpeedup() bool {
return flags.Lookup("speedup").Value.String() == "true"
}
func (flags *Flags) IsTokens() bool {
return flags.Lookup("tokens").Value.String() == "true"
}
@ -107,6 +111,10 @@ func (flags *Flags) SetParams(context whisper.Context) error {
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
context.SetDuration(duration)
}
if flags.IsSpeedup() {
fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
context.SetSpeedup(true)
}
if threads := flags.GetThreads(); threads != 0 {
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
context.SetThreads(threads)
@ -138,6 +146,7 @@ func registerFlags(flag *Flags) {
flag.Duration("offset", 0, "Time offset")
flag.Duration("duration", 0, "Duration of audio to process")
flag.Uint("threads", 0, "Number of threads to use")
flag.Bool("speedup", false, "Enable speedup")
flag.Uint("max-len", 0, "Maximum segment length in characters")
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
flag.Float64("word-thold", 0, "Maximum segment score")

View File

@ -67,7 +67,7 @@ func Process(model whisper.Model, path string, flags *Flags) error {
// Process the data
fmt.Fprintf(flags.Output(), " ...processing %q\n", path)
context.ResetTimings()
if err := context.Process(data, nil, cb, nil); err != nil {
if err := context.Process(data, cb); err != nil {
return err
}

View File

@ -1,10 +1,10 @@
module github.com/ggerganov/whisper.cpp/bindings/go
go 1.23
go 1.19
require (
github.com/go-audio/wav v1.1.0
github.com/stretchr/testify v1.9.0
github.com/stretchr/testify v1.8.1
)
require (

View File

@ -1,3 +1,4 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@ -8,9 +9,15 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -19,10 +19,6 @@ func (p *Params) SetTranslate(v bool) {
p.translate = toBool(v)
}
func (p *Params) SetSplitOnWord(v bool) {
p.split_on_word = toBool(v)
}
func (p *Params) SetNoContext(v bool) {
p.no_context = toBool(v)
}
@ -47,6 +43,10 @@ func (p *Params) SetPrintTimestamps(v bool) {
p.print_timestamps = toBool(v)
}
func (p *Params) SetSpeedup(v bool) {
p.speed_up = toBool(v)
}
// Set language id
func (p *Params) SetLanguage(lang int) error {
if lang == -1 {
@ -105,47 +105,11 @@ func (p *Params) SetMaxSegmentLength(n int) {
p.max_len = C.int(n)
}
func (p *Params) SetTokenTimestamps(b bool) {
p.token_timestamps = toBool(b)
}
// Set max tokens per segment (0 = no limit)
func (p *Params) SetMaxTokensPerSegment(n int) {
p.max_tokens = C.int(n)
}
// Set audio encoder context
func (p *Params) SetAudioCtx(n int) {
p.audio_ctx = C.int(n)
}
func (p *Params) SetMaxContext(n int) {
p.n_max_text_ctx = C.int(n)
}
func (p *Params) SetBeamSize(n int) {
p.beam_search.beam_size = C.int(n)
}
func (p *Params) SetEntropyThold(t float32) {
p.entropy_thold = C.float(t)
}
func (p *Params) SetTemperature(t float32) {
p.temperature = C.float(t)
}
// Sets the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (p *Params) SetTemperatureFallback(t float32) {
p.temperature_inc = C.float(t)
}
// Set initial prompt
func (p *Params) SetInitialPrompt(prompt string) {
p.initial_prompt = C.CString(prompt)
}
///////////////////////////////////////////////////////////////////////////////
// PRIVATE METHODS
@ -169,12 +133,6 @@ func (p *Params) String() string {
str += fmt.Sprintf(" n_max_text_ctx=%d", p.n_max_text_ctx)
str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
str += fmt.Sprintf(" temperature=%f", p.temperature)
str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
if p.translate {
str += " translate"
}
@ -199,6 +157,9 @@ func (p *Params) String() string {
if p.token_timestamps {
str += " token_timestamps"
}
if p.speed_up {
str += " speed_up"
}
return str + ">"
}

View File

@ -71,17 +71,14 @@ func (context *context) Language() string {
return whisper.Whisper_lang_str(context.params.Language())
}
func (context *context) DetectedLanguage() string {
return whisper.Whisper_lang_str(context.model.ctx.Whisper_full_lang_id())
}
// Set translate flag
func (context *context) SetTranslate(v bool) {
context.params.SetTranslate(v)
}
func (context *context) SetSplitOnWord(v bool) {
context.params.SetSplitOnWord(v)
// Set speedup flag
func (context *context) SetSpeedup(v bool) {
context.params.SetSpeedup(v)
}
// Set number of threads to use
@ -96,7 +93,7 @@ func (context *context) SetOffset(v time.Duration) {
// Set duration of audio to process
func (context *context) SetDuration(v time.Duration) {
context.params.SetDuration(int(v.Milliseconds()))
context.params.SetOffset(int(v.Milliseconds()))
}
// Set timestamp token probability threshold (~0.01)
@ -114,52 +111,11 @@ func (context *context) SetMaxSegmentLength(n uint) {
context.params.SetMaxSegmentLength(int(n))
}
// Set token timestamps flag
func (context *context) SetTokenTimestamps(b bool) {
context.params.SetTokenTimestamps(b)
}
// Set max tokens per segment (0 = no limit)
func (context *context) SetMaxTokensPerSegment(n uint) {
context.params.SetMaxTokensPerSegment(int(n))
}
// Set audio encoder context
func (context *context) SetAudioCtx(n uint) {
context.params.SetAudioCtx(int(n))
}
// Set maximum number of text context tokens to store
func (context *context) SetMaxContext(n int) {
context.params.SetMaxContext(n)
}
// Set Beam Size
func (context *context) SetBeamSize(n int) {
context.params.SetBeamSize(n)
}
// Set Entropy threshold
func (context *context) SetEntropyThold(t float32) {
context.params.SetEntropyThold(t)
}
// Set Temperature
func (context *context) SetTemperature(t float32) {
context.params.SetTemperature(t)
}
// Set the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (context *context) SetTemperatureFallback(t float32) {
context.params.SetTemperatureFallback(t)
}
// Set initial prompt
func (context *context) SetInitialPrompt(prompt string) {
context.params.SetInitialPrompt(prompt)
}
// ResetTimings resets the mode timings. Should be called before processing
func (context *context) ResetTimings() {
context.model.ctx.Whisper_reset_timings()
@ -191,49 +147,38 @@ func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]f
}
// Process new sample data and return any errors
func (context *context) Process(
data []float32,
callEncoderBegin EncoderBeginCallback,
callNewSegment SegmentCallback,
callProgress ProgressCallback,
) error {
func (context *context) Process(data []float32, cb SegmentCallback) error {
if context.model.ctx == nil {
return ErrInternalAppError
}
// If the callback is defined then we force on single_segment mode
if callNewSegment != nil {
if cb != nil {
context.params.SetSingleSegment(true)
}
// We don't do parallel processing at the moment
processors := 0
if processors > 1 {
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, callEncoderBegin,
func(new int) {
if callNewSegment != nil {
num_segments := context.model.ctx.Whisper_full_n_segments()
s0 := num_segments - new
for i := s0; i < num_segments; i++ {
callNewSegment(toSegment(context.model.ctx, i))
}
}
}); err != nil {
return err
}
} else if err := context.model.ctx.Whisper_full(context.params, data, callEncoderBegin,
func(new int) {
if callNewSegment != nil {
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
if cb != nil {
num_segments := context.model.ctx.Whisper_full_n_segments()
s0 := num_segments - new
for i := s0; i < num_segments; i++ {
callNewSegment(toSegment(context.model.ctx, i))
cb(toSegment(context.model.ctx, i))
}
}
}, func(progress int) {
if callProgress != nil {
callProgress(progress)
}
}); err != nil {
return err
}
} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
if cb != nil {
num_segments := context.model.ctx.Whisper_full_n_segments()
s0 := num_segments - new
for i := s0; i < num_segments; i++ {
cb(toSegment(context.model.ctx, i))
}
}
}); err != nil {
return err
}
@ -335,14 +280,10 @@ func toSegment(ctx *whisper.Context, n int) Segment {
func toTokens(ctx *whisper.Context, n int) []Token {
result := make([]Token, ctx.Whisper_full_n_tokens(n))
for i := 0; i < len(result); i++ {
data := ctx.Whisper_full_get_token_data(n, i)
result[i] = Token{
Id: int(ctx.Whisper_full_get_token_id(n, i)),
Text: ctx.Whisper_full_get_token_text(n, i),
P: ctx.Whisper_full_get_token_p(n, i),
Start: time.Duration(data.T0()) * time.Millisecond * 10,
End: time.Duration(data.T1()) * time.Millisecond * 10,
Id: int(ctx.Whisper_full_get_token_id(n, i)),
Text: strings.TrimSpace(ctx.Whisper_full_get_token_text(n, i)),
P: ctx.Whisper_full_get_token_p(n, i),
}
}
return result

View File

@ -4,121 +4,52 @@ import (
"os"
"testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
"github.com/go-audio/wav"
// Packages
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert"
)
func TestSetLanguage(t *testing.T) {
assert := assert.New(t)
const (
ModelPath = "../../models/ggml-tiny.bin"
SamplePath = "../../samples/jfk.wav"
)
func Test_Whisper_000(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
assert.NoError(model.Close())
t.Log("languages=", model.Languages())
}
func Test_Whisper_001(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
// Get context for decoding
ctx, err := model.NewContext()
assert.NoError(err)
assert.NotNil(ctx)
// This returns an error since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
err = context.SetLanguage("en")
assert.Error(err)
}
func TestContextModelIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
isMultilingual := context.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguage(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
// This always returns en since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
expectedLanguage := "en"
actualLanguage := context.Language()
assert.Equal(expectedLanguage, actualLanguage)
}
func TestProcess(t *testing.T) {
assert := assert.New(t)
fh, err := os.Open(SamplePath)
assert.NoError(err)
defer fh.Close()
// Decode the WAV file - load the full buffer
dec := wav.NewDecoder(fh)
buf, err := dec.FullPCMBuffer()
assert.NoError(err)
assert.Equal(uint16(1), dec.NumChans)
data := buf.AsFloat32Buffer().Data
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
err = context.Process(data, nil, nil, nil)
assert.NoError(err)
}
func TestDetectedLanguage(t *testing.T) {
assert := assert.New(t)
fh, err := os.Open(SamplePath)
assert.NoError(err)
defer fh.Close()
// Decode the WAV file - load the full buffer
dec := wav.NewDecoder(fh)
buf, err := dec.FullPCMBuffer()
assert.NoError(err)
assert.Equal(uint16(1), dec.NumChans)
data := buf.AsFloat32Buffer().Data
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
err = context.Process(data, nil, nil, nil)
assert.NoError(err)
expectedLanguage := "en"
actualLanguage := context.DetectedLanguage()
assert.Equal(expectedLanguage, actualLanguage)
}

View File

@ -12,14 +12,6 @@ import (
// time. It is called during the Process function
type SegmentCallback func(Segment)
// ProgressCallback is the callback function for reporting progress during
// processing. It is called during the Process function
type ProgressCallback func(int)
// EncoderBeginCallback is the callback function for checking if we want to
// continue processing. It is called during the Process function
type EncoderBeginCallback func() bool
// Model is the interface to a whisper model. Create a new model with the
// function whisper.New(string)
type Model interface {
@ -35,35 +27,26 @@ type Model interface {
Languages() []string
}
// Context is the speech recognition context.
// Context is the speach recognition context.
type Context interface {
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
SetTranslate(bool) // Set translate flag
IsMultilingual() bool // Return true if the model is multilingual.
Language() string // Get language
DetectedLanguage() string // Get detected language
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context
SetMaxContext(n int) // Set maximum number of text context tokens to store
SetBeamSize(n int) // Set Beam Size
SetEntropyThold(t float32) // Set Entropy threshold
SetInitialPrompt(prompt string) // Set initial prompt
SetTemperature(t float32) // Set temperature
SetTemperatureFallback(t float32) // Set temperature incrementation
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSpeedup(bool) // Set speedup flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
// Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the
// callback function during processing.
Process([]float32, EncoderBeginCallback, SegmentCallback, ProgressCallback) error
Process([]float32, SegmentCallback) error
// After process is called, return segments until the end of the stream
// is reached, when io.EOF is returned.
@ -102,8 +85,7 @@ type Segment struct {
// Token is a text or special token
type Token struct {
Id int
Text string
P float32
Start, End time.Duration
Id int
Text string
P float32
}

View File

@ -94,7 +94,6 @@ func (model *model) NewContext() (Context, error) {
params.SetPrintRealtime(false)
params.SetPrintTimestamps(false)
params.SetThreads(runtime.NumCPU())
params.SetNoContext(true)
// Return new context
return newContext(model, params)

View File

@ -1,91 +0,0 @@
package whisper_test
import (
"testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert"
)
func TestNew(t *testing.T) {
assert := assert.New(t)
t.Run("valid model path", func(t *testing.T) {
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
})
t.Run("invalid model path", func(t *testing.T) {
invalidModelPath := "invalid-model-path.bin"
model, err := whisper.New(invalidModelPath)
assert.Error(err)
assert.Nil(model)
})
}
func TestClose(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
err = model.Close()
assert.NoError(err)
}
func TestNewContext(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
assert.NotNil(context)
}
func TestIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
isMultilingual := model.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguages(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
expectedLanguages := []string{
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
}
actualLanguages := model.Languages()
assert.Equal(expectedLanguages, actualLanguages)
}

View File

@ -1,6 +0,0 @@
package whisper_test
const (
ModelPath = "../../models/ggml-small.en.bin"
SamplePath = "../../samples/jfk.wav"
)

View File

@ -9,37 +9,27 @@ import (
// CGO
/*
#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ -fopenmp
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
#cgo LDFLAGS: -lwhisper -lm -lstdc++
#cgo darwin LDFLAGS: -framework Accelerate
#include <whisper.h>
#include <stdlib.h>
extern void callNewSegment(void* user_data, int new);
extern void callProgress(void* user_data, int progress);
extern bool callEncoderBegin(void* user_data);
// Text segment callback
// Called on every newly generated text segment
// Use the whisper_full_...() functions to obtain the text segments
static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
if(user_data != NULL && ctx != NULL) {
callNewSegment(user_data, n_new);
}
}
// Progress callback
// Called on every newly generated text segment
// Use the whisper_full_...() functions to obtain the text segments
static void whisper_progress_cb(struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
if(user_data != NULL && ctx != NULL) {
callProgress(user_data, progress);
}
}
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
if(user_data != NULL && ctx != NULL) {
return callEncoderBegin(user_data);
}
@ -53,8 +43,6 @@ static struct whisper_full_params whisper_full_default_params_cb(struct whisper_
params.new_segment_callback_user_data = (void*)(ctx);
params.encoder_begin_callback = whisper_encoder_begin_cb;
params.encoder_begin_callback_user_data = (void*)(ctx);
params.progress_callback = whisper_progress_cb;
params.progress_callback_user_data = (void*)(ctx);
return params;
}
*/
@ -83,6 +71,7 @@ const (
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
NumFFT = C.WHISPER_N_FFT
NumMEL = C.WHISPER_N_MEL
HopLength = C.WHISPER_HOP_LENGTH
ChunkSize = C.WHISPER_CHUNK_SIZE
)
@ -102,7 +91,7 @@ var (
func Whisper_init(path string) *Context {
cPath := C.CString(path)
defer C.free(unsafe.Pointer(cPath))
if ctx := C.whisper_init_from_file_with_params(cPath, C.whisper_context_default_params()); ctx != nil {
if ctx := C.whisper_init_from_file(cPath); ctx != nil {
return (*Context)(ctx)
} else {
return nil
@ -269,13 +258,13 @@ func (ctx *Context) Whisper_token_lang(lang_id int) Token {
}
// Task tokens
func (ctx *Context) Whisper_token_translate() Token {
return Token(C.whisper_token_translate((*C.struct_whisper_context)(ctx)))
func Whisper_token_translate() Token {
return Token(C.whisper_token_translate())
}
// Task tokens
func (ctx *Context) Whisper_token_transcribe() Token {
return Token(C.whisper_token_transcribe((*C.struct_whisper_context)(ctx)))
func Whisper_token_transcribe() Token {
return Token(C.whisper_token_transcribe())
}
// Performance information
@ -301,19 +290,11 @@ func (ctx *Context) Whisper_full_default_params(strategy SamplingStrategy) Param
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text.
func (ctx *Context) Whisper_full(
params Params,
samples []float32,
encoderBeginCallback func() bool,
newSegmentCallback func(int),
progressCallback func(int),
) error {
func (ctx *Context) Whisper_full(params Params, samples []float32, encoderBeginCallback func() bool, newSegmentCallback func(int)) error {
registerEncoderBeginCallback(ctx, encoderBeginCallback)
registerNewSegmentCallback(ctx, newSegmentCallback)
registerProgressCallback(ctx, progressCallback)
defer registerEncoderBeginCallback(ctx, nil)
defer registerNewSegmentCallback(ctx, nil)
defer registerProgressCallback(ctx, nil)
if C.whisper_full((*C.struct_whisper_context)(ctx), (C.struct_whisper_full_params)(params), (*C.float)(&samples[0]), C.int(len(samples))) == 0 {
return nil
} else {
@ -337,18 +318,6 @@ func (ctx *Context) Whisper_full_parallel(params Params, samples []float32, proc
}
}
// Return the id of the autodetected language, returns -1 if not found
// Added to whisper.cpp in
// https://github.com/ggerganov/whisper.cpp/commit/a1c1583cc7cd8b75222857afc936f0638c5683d6
//
// Examples:
//
// "de" -> 2
// "german" -> 2
func (ctx *Context) Whisper_full_lang_id() int {
return int(C.whisper_full_lang_id((*C.struct_whisper_context)(ctx)))
}
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
func (ctx *Context) Whisper_full_n_segments() int {
@ -387,7 +356,7 @@ func (ctx *Context) Whisper_full_get_token_id(segment int, token int) Token {
// Get token data for the specified token in the specified segment.
// This contains probabilities, timestamps, etc.
func (ctx *Context) Whisper_full_get_token_data(segment int, token int) TokenData {
func (ctx *Context) whisper_full_get_token_data(segment int, token int) TokenData {
return TokenData(C.whisper_full_get_token_data((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
}
@ -401,7 +370,6 @@ func (ctx *Context) Whisper_full_get_token_p(segment int, token int) float32 {
var (
cbNewSegment = make(map[unsafe.Pointer]func(int))
cbProgress = make(map[unsafe.Pointer]func(int))
cbEncoderBegin = make(map[unsafe.Pointer]func() bool)
)
@ -413,14 +381,6 @@ func registerNewSegmentCallback(ctx *Context, fn func(int)) {
}
}
func registerProgressCallback(ctx *Context, fn func(int)) {
if fn == nil {
delete(cbProgress, unsafe.Pointer(ctx))
} else {
cbProgress[unsafe.Pointer(ctx)] = fn
}
}
func registerEncoderBeginCallback(ctx *Context, fn func() bool) {
if fn == nil {
delete(cbEncoderBegin, unsafe.Pointer(ctx))
@ -436,13 +396,6 @@ func callNewSegment(user_data unsafe.Pointer, new C.int) {
}
}
//export callProgress
func callProgress(user_data unsafe.Pointer, progress C.int) {
if fn, ok := cbProgress[user_data]; ok {
fn(int(progress))
}
}
//export callEncoderBegin
func callEncoderBegin(user_data unsafe.Pointer) C.bool {
if fn, ok := cbEncoderBegin[user_data]; ok {
@ -454,15 +407,3 @@ func callEncoderBegin(user_data unsafe.Pointer) C.bool {
}
return true
}
func (t TokenData) T0() int64 {
return int64(t.t0)
}
func (t TokenData) T1() int64 {
return int64(t.t1)
}
func (t TokenData) Id() Token {
return Token(t.id)
}

View File

@ -52,7 +52,7 @@ func Test_Whisper_001(t *testing.T) {
defer ctx.Whisper_free()
params := ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)
data := buf.AsFloat32Buffer().Data
err = ctx.Whisper_full(params, data, nil, nil, nil)
err = ctx.Whisper_full(params, data, nil, nil)
assert.NoError(err)
// Print out tokens

1
bindings/ios Submodule

Submodule bindings/ios added at d5c6d5c8a3

View File

@ -1,124 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Palette2">
<group name="Swing">
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.svg" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>

View File

@ -1,71 +0,0 @@
# Java JNI bindings for Whisper
This package provides Java JNI bindings for whisper.cpp. They have been tested on:
* <strike>Darwin (OS X) 12.6 on x64_64</strike>
* Ubuntu on x86_64
* Windows on x86_64
The "low level" bindings are in `WhisperCppJnaLibrary`. The most simple usage is as follows:
JNA will attempt to load the `whispercpp` shared library from:
- jna.library.path
- jna.platform.library
- ~/Library/Frameworks
- /Library/Frameworks
- /System/Library/Frameworks
- classpath
```java
import io.github.ggerganov.whispercpp.WhisperCpp;
public class Example {
public static void main(String[] args) {
WhisperCpp whisper = new WhisperCpp();
// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
// or you can provide the absolute path to the model file.
long context = whisper.initContext("base.en");
try {
var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
// custom configuration if required
whisperParams.temperature_inc = 0f;
var samples = readAudio(); // divide each value by 32767.0f
whisper.fullTranscribe(whisperParams, samples);
int segmentCount = whisper.getTextSegmentCount(context);
for (int i = 0; i < segmentCount; i++) {
String text = whisper.getTextSegment(context, i);
System.out.println(segment.getText());
}
} finally {
whisper.freeContext(context);
}
}
}
```
## Building & Testing
In order to build, you need to have the JDK 8 or higher installed. Run the tests with:
```bash
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp/bindings/java
./gradlew build
```
You need to have the `whisper` library in your [JNA library path](https://java-native-access.github.io/jna/4.2.1/com/sun/jna/NativeLibrary.html). On Windows the dll is included in the jar and you can update it:
```bash
copy /y ..\..\build\bin\Release\whisper.dll build\generated\resources\main\win32-x86-64\whisper.dll
```
## License
The license for the Java bindings is the same as the license for the rest of the whisper.cpp project, which is the MIT License. See the `LICENSE` file for more details.

View File

@ -1,141 +0,0 @@
plugins {
id 'java'
id 'java-library'
id 'maven-publish'
id 'signing'
}
archivesBaseName = 'whispercpp'
group = 'io.github.ggerganov'
version = '1.4.0'
sourceCompatibility = 1.8
targetCompatibility = 1.8
sourceSets {
main {
resources {
srcDirs = ['src/main/resources', 'build/generated/resources/main']
}
}
test {
runtimeClasspath += files('build/generated/resources/main')
}
}
tasks.register('copyLibwhisperDynlib', Copy) {
from '../../build/src'
include 'libwhisper.dylib'
into 'build/generated/resources/main/darwin'
}
tasks.register('copyLibwhisperSo', Copy) {
from '../../build/src'
include 'libwhisper.so'
into 'build/generated/resources/main/linux-x86-64'
}
tasks.register('copyWhisperDll', Copy) {
from '../../build/Release'
include 'whisper.dll'
into 'build/generated/resources/main/windows-x86-64'
}
tasks.register('copyLibs') {
dependsOn copyLibwhisperDynlib, copyLibwhisperSo, copyWhisperDll
}
test {
systemProperty 'jna.library.path', project.file('build/generated/resources/main').absolutePath
}
java {
withSourcesJar()
withJavadocJar()
}
sourcesJar() {
dependsOn copyLibs
}
jar {
dependsOn copyLibs
exclude '**/whisper_java.exp', '**/whisper_java.lib'
}
javadoc {
options.addStringOption('Xdoclint:none', '-quiet')
}
tasks.withType(Test) {
useJUnitPlatform()
}
test.dependsOn copyLibs
processResources.dependsOn copyLibs
dependencies {
implementation "net.java.dev.jna:jna:5.13.0"
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
testImplementation "org.assertj:assertj-core:3.24.2"
}
repositories {
mavenCentral()
}
publishing {
publications {
mavenJava(MavenPublication) {
artifactId = 'whispercpp'
from components.java
pom {
name = 'whispercpp'
description = "Java JNA bindings for OpenAI's Whisper model, implemented in C/C++"
url = 'https://github.com/ggerganov/whisper.cpp'
licenses {
license {
name = 'MIT licence'
url = 'https://raw.githubusercontent.com/ggerganov/whisper.cpp/master/LICENSE'
}
}
developers {
developer {
id = 'ggerganov'
name = 'Georgi Gerganov'
email = 'ggerganov@gmail.com'
}
developer {
id = 'nalbion'
name = 'Nicholas Albion'
email = 'nalbion@yahoo.com'
}
}
scm {
connection = 'scm:git:git://github.com/ggerganov/whisper.cpp.git'
url = 'https://github.com/ggerganov/whisper.cpp'
}
}
}
}
repositories {
maven {
def releasesRepoUrl = 'https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/'
def snapshotsRepoUrl = 'https://s01.oss.sonatype.org/content/repositories/snapshots/'
url = version.endsWith('-SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
credentials {
username = System.getenv("MAVEN_USERNAME")
password = System.getenv("MAVEN_PASSWORD")
}
}
}
}
signing {
def signingKey = System.getenv("PGP_SECRET")
def signingPassword = System.getenv("PGP_PASSPHRASE")
useInMemoryPgpKeys(signingKey, signingPassword)
sign publishing.publications.mavenJava
}

View File

@ -1,6 +0,0 @@
org.gradle.jvmargs=-Xms256m -Xmx1024m
system.include.dir=/usr/include
#system.local.include.dir=../../include
system.local.include.dir=./build/generated/sources/headers/java/main
jni.include.dir=/usr/lib/jvm/java-8-openjdk-amd64/include/
jni.lib.dir=/usr/lib/jvm/java-8-openjdk-amd64/lib/

Binary file not shown.

View File

@ -1,6 +0,0 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.1-bin.zip
networkTimeout=10000
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

244
bindings/java/gradlew vendored
View File

@ -1,244 +0,0 @@
#!/bin/sh
#
# Copyright © 2015-2021 the original authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
#
# Gradle start up script for POSIX generated by Gradle.
#
# Important for running:
#
# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
# noncompliant, but you have some other compliant shell such as ksh or
# bash, then to run this script, type that shell name before the whole
# command line, like:
#
# ksh Gradle
#
# Busybox and similar reduced shells will NOT work, because this script
# requires all of these POSIX shell features:
# * functions;
# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
# * compound commands having a testable exit status, especially «case»;
# * various built-in commands including «command», «set», and «ulimit».
#
# Important for patching:
#
# (2) This script targets any POSIX shell, so it avoids extensions provided
# by Bash, Ksh, etc; in particular arrays are avoided.
#
# The "traditional" practice of packing multiple parameters into a
# space-separated string is a well documented source of bugs and security
# problems, so this is (mostly) avoided, by progressively accumulating
# options in "$@", and eventually passing that to Java.
#
# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
# see the in-line comments for details.
#
# There are tweaks for specific operating systems such as AIX, CygWin,
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
#
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
app_path=$0
# Need this for daisy-chained symlinks.
while
APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
[ -h "$app_path" ]
do
ls=$( ls -ld "$app_path" )
link=${ls#*' -> '}
case $link in #(
/*) app_path=$link ;; #(
*) app_path=$APP_HOME$link ;;
esac
done
# This is normally unused
# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
warn () {
echo "$*"
} >&2
die () {
echo
echo "$*"
echo
exit 1
} >&2
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "$( uname )" in #(
CYGWIN* ) cygwin=true ;; #(
Darwin* ) darwin=true ;; #(
MSYS* | MINGW* ) msys=true ;; #(
NONSTOP* ) nonstop=true ;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD=$JAVA_HOME/jre/sh/java
else
JAVACMD=$JAVA_HOME/bin/java
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
# In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
# shellcheck disable=SC3045
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
# In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
# shellcheck disable=SC3045
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
fi
# Collect all arguments for the java command, stacking in reverse order:
# * args from the command line
# * the main class name
# * -classpath
# * -D...appname settings
# * --module-path (only if needed)
# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
# For Cygwin or MSYS, switch paths to Windows format before running java
if "$cygwin" || "$msys" ; then
APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
JAVACMD=$( cygpath --unix "$JAVACMD" )
# Now convert the arguments - kludge to limit ourselves to /bin/sh
for arg do
if
case $arg in #(
-*) false ;; # don't mess with options #(
/?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
[ -e "$t" ] ;; #(
*) false ;;
esac
then
arg=$( cygpath --path --ignore --mixed "$arg" )
fi
# Roll the args list around exactly as many times as the number of
# args, so each arg winds up back in the position where it started, but
# possibly modified.
#
# NB: a `for` loop captures its iteration list before it begins, so
# changing the positional parameters here affects neither the number of
# iterations, nor the values presented in `arg`.
shift # remove old arg
set -- "$@" "$arg" # push replacement arg
done
fi
# Collect all arguments for the java command;
# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
# shell script including quotes and variable substitutions, so put them in
# double quotes to make sure that they get re-expanded; and
# * put everything else in single quotes, so that it's not re-expanded.
set -- \
"-Dorg.gradle.appname=$APP_BASE_NAME" \
-classpath "$CLASSPATH" \
org.gradle.wrapper.GradleWrapperMain \
"$@"
# Stop when "xargs" is not available.
if ! command -v xargs >/dev/null 2>&1
then
die "xargs is not available"
fi
# Use "xargs" to parse quoted args.
#
# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
#
# In Bash we could simply go:
#
# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
# set -- "${ARGS[@]}" "$@"
#
# but POSIX shell has neither arrays nor command substitution, so instead we
# post-process each arg (as a line of input to sed) to backslash-escape any
# character that might be a shell metacharacter, then use eval to reverse
# that process (while maintaining the separation between arguments), and wrap
# the whole thing up as a single "set" statement.
#
# This will of course break if any of these variables contains a newline or
# an unmatched quote.
#
eval "set -- $(
printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
xargs -n1 |
sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
tr '\n' ' '
)" '"$@"'
exec "$JAVACMD" "$@"

View File

@ -1,92 +0,0 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%"=="" set DIRNAME=.
@rem This is normally unused
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if %ERRORLEVEL% equ 0 goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
if %ERRORLEVEL% equ 0 goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
set EXIT_CODE=%ERRORLEVEL%
if %EXIT_CODE% equ 0 set EXIT_CODE=1
if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
exit /b %EXIT_CODE%
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega

View File

@ -1 +0,0 @@
rootProject.name = "whispercpp"

View File

@ -1,24 +0,0 @@
package io.github.ggerganov.whispercpp;
/**
* Presets for alignment heads in DTW token timestamps
*/
public class WhisperConstants {
// Alignment heads presets
public static final int WHISPER_AHEADS_NONE = 0;
public static final int WHISPER_AHEADS_TINY_EN = 1;
public static final int WHISPER_AHEADS_TINY = 2;
public static final int WHISPER_AHEADS_BASE_EN = 3;
public static final int WHISPER_AHEADS_BASE = 4;
public static final int WHISPER_AHEADS_SMALL_EN = 5;
public static final int WHISPER_AHEADS_SMALL = 6;
public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
public static final int WHISPER_AHEADS_MEDIUM = 8;
public static final int WHISPER_AHEADS_LARGE_V1 = 9;
public static final int WHISPER_AHEADS_LARGE_V2 = 10;
public static final int WHISPER_AHEADS_LARGE_V3 = 11;
public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
public static final int WHISPER_AHEADS_CUSTOM = 13;
public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
public static final int WHISPER_AHEADS_COUNT = 15;
}

View File

@ -1,36 +0,0 @@
package io.github.ggerganov.whispercpp;
import com.sun.jna.NativeLong;
import com.sun.jna.Structure;
import com.sun.jna.ptr.PointerByReference;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.ggml.GgmlType;
import io.github.ggerganov.whispercpp.WhisperModel;
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
import java.util.List;
public class WhisperContext extends Structure {
public NativeLong t_load_us;
public NativeLong t_start_us;
/** weight type (FP32 / FP16 / QX) */
public GgmlType wtype = GgmlType.GGML_TYPE_F16;
/** intermediate type (FP32 or FP16) */
public GgmlType itype = GgmlType.GGML_TYPE_F16;
public WhisperContextParams.ByValue params;
public Pointer model;
public Pointer vocab;
public Pointer state;
/** populated by whisper_init_from_file_with_params() */
public Pointer path_model;
@Override
protected List<String> getFieldOrder() {
return List.of("t_load_us", "t_start_us", "wtype", "itype",
"params", "model", "vocab", "state", "path_model");
}
}

View File

@ -1,216 +0,0 @@
package io.github.ggerganov.whispercpp;
import com.sun.jna.Native;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.bean.WhisperSegment;
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Before calling most methods, you must call `initContext(modelPath)` to initialise the `ctx` Pointer.
*/
public class WhisperCpp implements AutoCloseable {
private WhisperCppJnaLibrary lib = WhisperCppJnaLibrary.instance;
private Pointer ctx = null;
private Pointer paramsPointer = null;
private Pointer greedyParamsPointer = null;
private Pointer beamParamsPointer = null;
public File modelDir() {
String modelDirPath = System.getenv("XDG_CACHE_HOME");
if (modelDirPath == null) {
modelDirPath = System.getProperty("user.home") + "/.cache";
}
return new File(modelDirPath, "whisper");
}
/**
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
*/
public void initContext(String modelPath) throws FileNotFoundException {
initContextImpl(modelPath, getContextDefaultParams());
}
/**
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
* @param params - params to use when initialising the context
*/
public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
initContextImpl(modelPath, params);
}
private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
if (ctx != null) {
lib.whisper_free(ctx);
}
if (!modelPath.contains("/") && !modelPath.contains("\\")) {
if (!modelPath.endsWith(".bin")) {
modelPath = "ggml-" + modelPath.replace("-", ".") + ".bin";
}
modelPath = new File(modelDir(), modelPath).getAbsolutePath();
}
ctx = lib.whisper_init_from_file_with_params(modelPath, params);
if (ctx == null) {
throw new FileNotFoundException(modelPath);
}
}
/**
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
* Returns a ByValue instance to ensure proper parameter passing to native code.
*/
public WhisperContextParams.ByValue getContextDefaultParams() {
WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
lib.whisper_context_default_params_by_ref());
valueParams.read();
return valueParams;
}
/**
* Provides default params which can be used with `whisper_full()` etc.
* Because this function allocates memory for the params, the caller must call either:
* - call `whisper_free_params()`
* - `Native.free(Pointer.nativeValue(pointer));`
*
* @param strategy - GREEDY
*/
public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
Pointer pointer;
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
if (strategy == WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY) {
if (greedyParamsPointer == null) {
greedyParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
}
pointer = greedyParamsPointer;
} else {
if (beamParamsPointer == null) {
beamParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
}
pointer = beamParamsPointer;
}
WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
params.read();
return params;
}
@Override
public void close() {
freeContext();
freeParams();
System.out.println("Whisper closed");
}
private void freeContext() {
if (ctx != null) {
lib.whisper_free(ctx);
}
}
private void freeParams() {
if (paramsPointer != null) {
Native.free(Pointer.nativeValue(paramsPointer));
paramsPointer = null;
}
if (greedyParamsPointer != null) {
Native.free(Pointer.nativeValue(greedyParamsPointer));
greedyParamsPointer = null;
}
if (beamParamsPointer != null) {
Native.free(Pointer.nativeValue(beamParamsPointer));
beamParamsPointer = null;
}
}
/**
* Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text.
* Not thread safe for same context
* Uses the specified decoding strategy to obtain the text.
*/
public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
if (ctx == null) {
throw new IllegalStateException("Model not initialised");
}
/*
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
valueParams.read();
*/
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
throw new IOException("Failed to process audio");
}
int nSegments = lib.whisper_full_n_segments(ctx);
StringBuilder str = new StringBuilder();
for (int i = 0; i < nSegments; i++) {
String text = lib.whisper_full_get_segment_text(ctx, i);
System.out.println("Segment:" + text);
str.append(text);
}
return str.toString().trim();
}
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
if (ctx == null) {
throw new IllegalStateException("Model not initialised");
}
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
valueParams.read();
if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
throw new IOException("Failed to process audio");
}
int nSegments = lib.whisper_full_n_segments(ctx);
List<WhisperSegment> segments= new ArrayList<>(nSegments);
for (int i = 0; i < nSegments; i++) {
long t0 = lib.whisper_full_get_segment_t0(ctx, i);
String text = lib.whisper_full_get_segment_text(ctx, i);
long t1 = lib.whisper_full_get_segment_t1(ctx, i);
segments.add(new WhisperSegment(t0,t1,text));
}
return segments;
}
// public int getTextSegmentCount(Pointer ctx) {
// return lib.whisper_full_n_segments(ctx);
// }
// public String getTextSegment(Pointer ctx, int index) {
// return lib.whisper_full_get_segment_text(ctx, index);
// }
public String getSystemInfo() {
return lib.whisper_print_system_info();
}
public int benchMemcpy(int nthread) {
return lib.whisper_bench_memcpy(nthread);
}
public int benchGgmlMulMat(int nthread) {
return lib.whisper_bench_ggml_mul_mat(nthread);
}
}

View File

@ -1,389 +0,0 @@
package io.github.ggerganov.whispercpp;
import com.sun.jna.Library;
import com.sun.jna.Native;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.model.WhisperModelLoader;
import io.github.ggerganov.whispercpp.model.WhisperTokenData;
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
public interface WhisperCppJnaLibrary extends Library {
WhisperCppJnaLibrary instance = Native.load("whisper", WhisperCppJnaLibrary.class);
String whisper_print_system_info();
/**
* DEPRECATED. Allocate (almost) all memory needed for the model by loading from a file.
*
* @param path_model Path to the model file
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file(String path_model);
/**
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
* Because this function allocates memory for the params, the caller must call either:
* - call `whisper_free_context_params()`
* - `Native.free(Pointer.nativeValue(pointer));`
*/
Pointer whisper_context_default_params_by_ref();
void whisper_free_context_params(Pointer params);
/**
* Allocate (almost) all memory needed for the model by loading from a file.
*
* @param path_model Path to the model file
* @param params Pointer to whisper_context_params
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);
/**
* Allocate (almost) all memory needed for the model by loading from a buffer.
*
* @param buffer Model buffer
* @param buffer_size Size of the model buffer
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_buffer(Pointer buffer, int buffer_size);
/**
* Allocate (almost) all memory needed for the model using a model loader.
*
* @param loader Model loader
* @return Whisper context on success, null on failure
*/
Pointer whisper_init(WhisperModelLoader loader);
/**
* Allocate (almost) all memory needed for the model by loading from a file without allocating the state.
*
* @param path_model Path to the model file
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file_no_state(String path_model);
/**
* Allocate (almost) all memory needed for the model by loading from a buffer without allocating the state.
*
* @param buffer Model buffer
* @param buffer_size Size of the model buffer
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_buffer_no_state(Pointer buffer, int buffer_size);
// Pointer whisper_init_from_buffer_no_state(Pointer buffer, long buffer_size);
/**
* Allocate (almost) all memory needed for the model using a model loader without allocating the state.
*
* @param loader Model loader
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_no_state(WhisperModelLoader loader);
/**
* Allocate memory for the Whisper state.
*
* @param ctx Whisper context
* @return Whisper state on success, null on failure
*/
Pointer whisper_init_state(Pointer ctx);
/**
* Free all allocated memory associated with the Whisper context.
*
* @param ctx Whisper context
*/
void whisper_free(Pointer ctx);
/**
* Free all allocated memory associated with the Whisper state.
*
* @param state Whisper state
*/
void whisper_free_state(Pointer state);
/**
* Convert RAW PCM audio to log mel spectrogram.
* The resulting spectrogram is stored inside the default state of the provided whisper context.
*
* @param ctx - Pointer to a WhisperContext
* @return 0 on success
*/
int whisper_pcm_to_mel(Pointer ctx, final float[] samples, int n_samples, int n_threads);
/**
* @param ctx Pointer to a WhisperContext
* @param state Pointer to WhisperState
* @param n_samples
* @param n_threads
* @return 0 on success
*/
int whisper_pcm_to_mel_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
/**
* This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
* Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
* n_mel must be 80
* @return 0 on success
*/
int whisper_set_mel(Pointer ctx, final float[] data, int n_len, int n_mel);
int whisper_set_mel_with_state(Pointer ctx, Pointer state, final float[] data, int n_len, int n_mel);
/**
* Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
* Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
* Offset can be used to specify the offset of the first frame in the spectrogram.
* @return 0 on success
*/
int whisper_encode(Pointer ctx, int offset, int n_threads);
int whisper_encode_with_state(Pointer ctx, Pointer state, int offset, int n_threads);
/**
* Run the Whisper decoder to obtain the logits and probabilities for the next token.
* Make sure to call whisper_encode() first.
* tokens + n_tokens is the provided context for the decoder.
* n_past is the number of tokens to use from previous decoder calls.
* Returns 0 on success
* TODO: add support for multiple decoders
*/
int whisper_decode(Pointer ctx, Pointer tokens, int n_tokens, int n_past, int n_threads);
/**
* @param ctx
* @param state
* @param tokens Pointer to int tokens
* @param n_tokens
* @param n_past
* @param n_threads
* @return
*/
int whisper_decode_with_state(Pointer ctx, Pointer state, Pointer tokens, int n_tokens, int n_past, int n_threads);
/**
* Convert the provided text into tokens.
* The tokens pointer must be large enough to hold the resulting tokens.
* Returns the number of tokens on success, no more than n_max_tokens
* Returns -1 on failure
* TODO: not sure if correct
*/
int whisper_tokenize(Pointer ctx, String text, Pointer tokens, int n_max_tokens);
/** Largest language id (i.e. number of available languages - 1) */
int whisper_lang_max_id();
/**
* @return the id of the specified language, returns -1 if not found.
* Examples:
* "de" -&gt; 2
* "german" -&gt; 2
*/
int whisper_lang_id(String lang);
/** @return the short string of the specified language id (e.g. 2 -&gt; "de"), returns nullptr if not found */
String whisper_lang_str(int id);
/**
* Use mel data at offset_ms to try and auto-detect the spoken language.
* Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
* Returns the top language id or negative on failure
* If not null, fills the lang_probs array with the probabilities of all languages
* The array must be whisper_lang_max_id() + 1 in size
*
* ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
*/
int whisper_lang_auto_detect(Pointer ctx, int offset_ms, int n_threads, float[] lang_probs);
int whisper_lang_auto_detect_with_state(Pointer ctx, Pointer state, int offset_ms, int n_threads, float[] lang_probs);
int whisper_n_len (Pointer ctx); // mel length
int whisper_n_len_from_state(Pointer state); // mel length
int whisper_n_vocab (Pointer ctx);
int whisper_n_text_ctx (Pointer ctx);
int whisper_n_audio_ctx (Pointer ctx);
int whisper_is_multilingual (Pointer ctx);
int whisper_model_n_vocab (Pointer ctx);
int whisper_model_n_audio_ctx (Pointer ctx);
int whisper_model_n_audio_state(Pointer ctx);
int whisper_model_n_audio_head (Pointer ctx);
int whisper_model_n_audio_layer(Pointer ctx);
int whisper_model_n_text_ctx (Pointer ctx);
int whisper_model_n_text_state (Pointer ctx);
int whisper_model_n_text_head (Pointer ctx);
int whisper_model_n_text_layer (Pointer ctx);
int whisper_model_n_mels (Pointer ctx);
int whisper_model_ftype (Pointer ctx);
int whisper_model_type (Pointer ctx);
/**
* Token logits obtained from the last call to whisper_decode().
* The logits for the last token are stored in the last row
* Rows: n_tokens
* Cols: n_vocab
*/
float[] whisper_get_logits (Pointer ctx);
float[] whisper_get_logits_from_state(Pointer state);
// Token Id -> String. Uses the vocabulary in the provided context
String whisper_token_to_str(Pointer ctx, int token);
String whisper_model_type_readable(Pointer ctx);
// Special tokens
int whisper_token_eot (Pointer ctx);
int whisper_token_sot (Pointer ctx);
int whisper_token_prev(Pointer ctx);
int whisper_token_solm(Pointer ctx);
int whisper_token_not (Pointer ctx);
int whisper_token_beg (Pointer ctx);
int whisper_token_lang(Pointer ctx, int lang_id);
// Task tokens
int whisper_token_translate (Pointer ctx);
int whisper_token_transcribe(Pointer ctx);
// Performance information from the default state.
void whisper_print_timings(Pointer ctx);
void whisper_reset_timings(Pointer ctx);
// Note: Even if `whisper_full_params is stripped back to just 4 ints, JNA throws "Invalid memory access"
// when `whisper_full_default_params()` tries to return a struct.
// WhisperFullParams whisper_full_default_params(int strategy);
/**
* Provides default params which can be used with `whisper_full()` etc.
* Because this function allocates memory for the params, the caller must call either:
* - call `whisper_free_params()`
* - `Native.free(Pointer.nativeValue(pointer));`
*
* @param strategy - WhisperSamplingStrategy.value
*/
Pointer whisper_full_default_params_by_ref(int strategy);
void whisper_free_params(Pointer params);
/**
* Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text
* Not thread safe for same context
* Uses the specified decoding strategy to obtain the text.
*/
int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);
public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
//int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
// Result is stored in the default state of the context
// Not thread safe if executed in parallel on the same context.
// It seems this approach can offer some speedup in some cases.
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);
/**
* Number of generated text segments.
* A segment can be a few words, a sentence, or even a paragraph.
* @param ctx Pointer to WhisperContext
*/
int whisper_full_n_segments (Pointer ctx);
/**
* @param state Pointer to WhisperState
*/
int whisper_full_n_segments_from_state(Pointer state);
/**
* Language id associated with the context's default state.
* @param ctx Pointer to WhisperContext
*/
int whisper_full_lang_id(Pointer ctx);
/** Language id associated with the provided state */
int whisper_full_lang_id_from_state(Pointer state);
/** Get the start time of the specified segment. */
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
/** Get the start time of the specified segment from the state. */
long whisper_full_get_segment_t0_from_state(Pointer state, int i_segment);
/** Get the end time of the specified segment. */
long whisper_full_get_segment_t1(Pointer ctx, int i_segment);
/** Get the end time of the specified segment from the state. */
long whisper_full_get_segment_t1_from_state(Pointer state, int i_segment);
/** Get the text of the specified segment. */
String whisper_full_get_segment_text(Pointer ctx, int i_segment);
/** Get the text of the specified segment from the state. */
String whisper_full_get_segment_text_from_state(Pointer state, int i_segment);
/** Get the number of tokens in the specified segment. */
int whisper_full_n_tokens(Pointer ctx, int i_segment);
/** Get the number of tokens in the specified segment from the state. */
int whisper_full_n_tokens_from_state(Pointer state, int i_segment);
/** Get the token text of the specified token in the specified segment. */
String whisper_full_get_token_text(Pointer ctx, int i_segment, int i_token);
/** Get the token text of the specified token in the specified segment from the state. */
String whisper_full_get_token_text_from_state(Pointer ctx, Pointer state, int i_segment, int i_token);
/** Get the token ID of the specified token in the specified segment. */
int whisper_full_get_token_id(Pointer ctx, int i_segment, int i_token);
/** Get the token ID of the specified token in the specified segment from the state. */
int whisper_full_get_token_id_from_state(Pointer state, int i_segment, int i_token);
/** Get token data for the specified token in the specified segment. */
WhisperTokenData whisper_full_get_token_data(Pointer ctx, int i_segment, int i_token);
/** Get token data for the specified token in the specified segment from the state. */
WhisperTokenData whisper_full_get_token_data_from_state(Pointer state, int i_segment, int i_token);
/** Get the probability of the specified token in the specified segment. */
float whisper_full_get_token_p(Pointer ctx, int i_segment, int i_token);
/** Get the probability of the specified token in the specified segment from the state. */
float whisper_full_get_token_p_from_state(Pointer state, int i_segment, int i_token);
/**
* Benchmark function for memcpy.
*
* @param nThreads Number of threads to use for the benchmark.
* @return The result of the benchmark.
*/
int whisper_bench_memcpy(int nThreads);
/**
* Benchmark function for memcpy as a string.
*
* @param nThreads Number of threads to use for the benchmark.
* @return The result of the benchmark as a string.
*/
String whisper_bench_memcpy_str(int nThreads);
/**
* Benchmark function for ggml_mul_mat.
*
* @param nThreads Number of threads to use for the benchmark.
* @return The result of the benchmark.
*/
int whisper_bench_ggml_mul_mat(int nThreads);
/**
* Benchmark function for ggml_mul_mat as a string.
*
* @param nThreads Number of threads to use for the benchmark.
* @return The result of the benchmark as a string.
*/
String whisper_bench_ggml_mul_mat_str(int nThreads);
}

View File

@ -1,47 +0,0 @@
package io.github.ggerganov.whispercpp.bean;
/**
* Created by litonglinux@qq.com on 10/21/2023_7:48 AM
*/
public class WhisperSegment {
private long start, end;
private String sentence;
public WhisperSegment() {
}
public WhisperSegment(long start, long end, String sentence) {
this.start = start;
this.end = end;
this.sentence = sentence;
}
public long getStart() {
return start;
}
public long getEnd() {
return end;
}
public String getSentence() {
return sentence;
}
public void setStart(long start) {
this.start = start;
}
public void setEnd(long end) {
this.end = end;
}
public void setSentence(String sentence) {
this.sentence = sentence;
}
@Override
public String toString() {
return "[" + start + " --> " + end + "]:" + sentence;
}
}

View File

@ -1,17 +0,0 @@
package io.github.ggerganov.whispercpp.callbacks;
import com.sun.jna.Callback;
/**
* Callback for aborting GGML computation
* Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
*/
public interface GgmlAbortCallback extends Callback {
/**
* Return true to abort the computation, false to continue
*
* @param data User data passed to the callback
* @return true to abort, false to continue
*/
boolean invoke(com.sun.jna.Pointer data);
}

View File

@ -1,24 +0,0 @@
package io.github.ggerganov.whispercpp.callbacks;
import com.sun.jna.Callback;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.WhisperContext;
import io.github.ggerganov.whispercpp.model.WhisperState;
/**
* Callback before the encoder starts.
* If not null, called before the encoder starts.
* If it returns false, the computation is aborted.
*/
public interface WhisperEncoderBeginCallback extends Callback {
/**
* Callback method before the encoder starts.
*
* @param ctx The whisper context.
* @param state The whisper state.
* @param user_data User data.
* @return True if the computation should proceed, false otherwise.
*/
boolean callback(Pointer ctx, Pointer state, Pointer user_data);
}

View File

@ -1,25 +0,0 @@
package io.github.ggerganov.whispercpp.callbacks;
import com.sun.jna.Callback;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.model.WhisperTokenData;
/**
* Callback to filter logits.
* Can be used to modify the logits before sampling.
* If not null, called after applying temperature to logits.
*/
public interface WhisperLogitsFilterCallback extends Callback {
/**
* Callback method to filter logits.
*
* @param ctx The whisper context.
* @param state The whisper state.
* @param tokens The array of whisper_token_data.
* @param n_tokens The number of tokens.
* @param logits The array of logits.
* @param user_data User data.
*/
void callback(Pointer ctx, Pointer state, WhisperTokenData[] tokens, int n_tokens, float[] logits, Pointer user_data);
}

View File

@ -1,24 +0,0 @@
package io.github.ggerganov.whispercpp.callbacks;
import com.sun.jna.Callback;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.WhisperContext;
import io.github.ggerganov.whispercpp.model.WhisperState;
/**
* Callback for the text segment.
* Called on every newly generated text segment.
* Use the whisper_full_...() functions to obtain the text segments.
*/
public interface WhisperNewSegmentCallback extends Callback {
/**
* Callback method for the text segment.
*
* @param ctx The whisper context.
* @param state The whisper state.
* @param n_new The number of newly generated text segments.
* @param user_data User data.
*/
void callback(Pointer ctx, Pointer state, int n_new, Pointer user_data);
}

View File

@ -1,22 +0,0 @@
package io.github.ggerganov.whispercpp.callbacks;
import com.sun.jna.Callback;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.WhisperContext;
import io.github.ggerganov.whispercpp.model.WhisperState;
/**
* Callback for progress updates.
*/
public interface WhisperProgressCallback extends Callback {
/**
* Callback method for progress updates.
*
* @param ctx The whisper context.
* @param state The whisper state.
* @param progress The progress value.
* @param user_data User data.
*/
void callback(Pointer ctx, Pointer state, int progress, Pointer user_data);
}

View File

@ -1,4 +0,0 @@
package io.github.ggerganov.whispercpp.ggml;
public class GgmlTensor {
}

View File

@ -1,18 +0,0 @@
package io.github.ggerganov.whispercpp.ggml;
public enum GgmlType {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
REMOVED_GGML_TYPE_Q4_2, // support has been removed
REMOVED_GGML_TYPE_Q4_3, // support has been removed
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q8_1,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
}

View File

@ -1,10 +0,0 @@
package io.github.ggerganov.whispercpp.model;
public enum EModel {
MODEL_UNKNOWN,
MODEL_TINY,
MODEL_BASE,
MODEL_SMALL,
MODEL_MEDIUM,
MODEL_LARGE,
}

View File

@ -1,49 +0,0 @@
package io.github.ggerganov.whispercpp;
import io.github.ggerganov.whispercpp.ggml.GgmlTensor;
import io.github.ggerganov.whispercpp.model.EModel;
public class WhisperModel {
// EModel type = EModel.MODEL_UNKNOWN;
//
// WhisperHParams hparams;
// WhisperFilters filters;
//
// // encoder.positional_embedding
// GgmlTensor e_pe;
//
// // encoder.conv1
// GgmlTensor e_conv_1_w;
// GgmlTensor e_conv_1_b;
//
// // encoder.conv2
// GgmlTensor e_conv_2_w;
// GgmlTensor e_conv_2_b;
//
// // encoder.ln_post
// GgmlTensor e_ln_w;
// GgmlTensor e_ln_b;
//
// // decoder.positional_embedding
// GgmlTensor d_pe;
//
// // decoder.token_embedding
// GgmlTensor d_te;
//
// // decoder.ln
// GgmlTensor d_ln_w;
// GgmlTensor d_ln_b;
//
// std::vector<whisper_layer_encoder> layers_encoder;
// std::vector<whisper_layer_decoder> layers_decoder;
//
// // context
// struct ggml_context * ctx;
//
// // the model memory buffer is read-only and can be shared between processors
// std::vector<uint8_t> * buf;
//
// // tensors
// int n_loaded;
// Map<String, GgmlTensor> tensors;
}

View File

@ -1,62 +0,0 @@
package io.github.ggerganov.whispercpp.model;
import com.sun.jna.Callback;
import com.sun.jna.Pointer;
import com.sun.jna.Structure;
public class WhisperModelLoader extends Structure {
public Pointer context;
public ReadFunction read;
public EOFFunction eof;
public CloseFunction close;
public static class ReadFunction implements Callback {
public Pointer invoke(Pointer ctx, Pointer output, int readSize) {
// TODO
return ctx;
}
}
public static class EOFFunction implements Callback {
public boolean invoke(Pointer ctx) {
// TODO
return false;
}
}
public static class CloseFunction implements Callback {
public void invoke(Pointer ctx) {
// TODO
}
}
// public WhisperModelLoader(Pointer p) {
// super(p);
// read = new ReadFunction();
// eof = new EOFFunction();
// close = new CloseFunction();
// read.setCallback(this);
// eof.setCallback(this);
// close.setCallback(this);
// read.write();
// eof.write();
// close.write();
// }
public WhisperModelLoader() {
super();
}
public interface ReadCallback extends Callback {
Pointer invoke(Pointer ctx, Pointer output, int readSize);
}
public interface EOFCallback extends Callback {
boolean invoke(Pointer ctx);
}
public interface CloseCallback extends Callback {
void invoke(Pointer ctx);
}
}

View File

@ -1,4 +0,0 @@
package io.github.ggerganov.whispercpp.model;
public class WhisperState {
}

View File

@ -1,50 +0,0 @@
package io.github.ggerganov.whispercpp.model;
import com.sun.jna.Structure;
import java.util.Arrays;
import java.util.List;
/**
* Structure representing token data.
*/
public class WhisperTokenData extends Structure {
/** Token ID. */
public int id;
/** Forced timestamp token ID. */
public int tid;
/** Probability of the token. */
public float p;
/** Log probability of the token. */
public float plog;
/** Probability of the timestamp token. */
public float pt;
/** Sum of probabilities of all timestamp tokens. */
public float ptsum;
/**
* Start time of the token (token-level timestamp data).
* Do not use if you haven't computed token-level timestamps.
*/
public long t0;
/**
* End time of the token (token-level timestamp data).
* Do not use if you haven't computed token-level timestamps.
*/
public long t1;
/** Voice length of the token. */
public float vlen;
@Override
protected List<String> getFieldOrder() {
return Arrays.asList("id", "tid", "p", "plog", "pt", "ptsum", "t0", "t1", "vlen");
}
}

View File

@ -1,19 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.Structure;
import java.util.Arrays;
import java.util.List;
public class BeamSearchParams extends Structure {
/** ref: <a href="https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265">...</a> */
public int beam_size;
/** ref: <a href="https://arxiv.org/pdf/2204.05424.pdf">...</a> */
public float patience;
@Override
protected List<String> getFieldOrder() {
return Arrays.asList("beam_size", "patience");
}
}

View File

@ -1,30 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.IntegerType;
import java.util.function.BooleanSupplier;
public class CBool extends IntegerType implements BooleanSupplier {
public static final int SIZE = 1;
public static final CBool FALSE = new CBool(0);
public static final CBool TRUE = new CBool(1);
public CBool() {
this(0);
}
public CBool(long value) {
super(SIZE, value, true);
}
@Override
public boolean getAsBoolean() {
return intValue() == 1;
}
@Override
public String toString() {
return intValue() == 1 ? "true" : "false";
}
}

View File

@ -1,16 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.Structure;
import java.util.Collections;
import java.util.List;
public class GreedyParams extends Structure {
/** <a href="https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264">...</a> */
public int best_of;
@Override
protected List<String> getFieldOrder() {
return Collections.singletonList("best_of");
}
}

View File

@ -1,30 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.*;
import java.util.Arrays;
import java.util.List;
public class WhisperAhead extends Structure {
public int n_text_layer;
public int n_head;
public WhisperAhead() {
super();
}
public WhisperAhead(int textLayer, int head) {
super();
this.n_text_layer = textLayer;
this.n_head = head;
}
@Override
protected List<String> getFieldOrder() {
return Arrays.asList("n_text_layer", "n_head");
}
public static class ByReference extends WhisperAhead implements Structure.ByReference {}
public static class ByValue extends WhisperAhead implements Structure.ByValue {}
}

View File

@ -1,41 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.*;
import java.util.Arrays;
import java.util.List;
public class WhisperAheads extends Structure {
public NativeLong n_heads;
public Pointer heads;
public WhisperAheads() {
super();
}
/**
* Create alignment heads from an array of WhisperAhead objects
*/
public void setHeads(WhisperAhead[] aheadsArray) {
this.n_heads = new NativeLong(aheadsArray.length);
int structSize = aheadsArray[0].size();
Memory mem = new Memory(structSize * aheadsArray.length);
for (int i = 0; i < aheadsArray.length; i++) {
aheadsArray[i].write();
byte[] buffer = aheadsArray[i].getPointer().getByteArray(0, structSize);
mem.write(i * structSize, buffer, 0, buffer.length);
}
this.heads = mem;
}
@Override
protected List<String> getFieldOrder() {
return Arrays.asList("n_heads", "heads");
}
public static class ByReference extends WhisperAheads implements Structure.ByReference {}
public static class ByValue extends WhisperAheads implements Structure.ByValue {}
}

View File

@ -1,81 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.*;
import java.util.Arrays;
import java.util.List;
/**
* Parameters for the whisper_init_from_file_with_params() function.
* If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
* whisper_context_default_params()
*/
public class WhisperContextParams extends Structure {
public WhisperContextParams(Pointer p) {
super(p);
}
public WhisperContextParams() {
super();
}
/** Use GPU for inference (default = true) */
public CBool use_gpu;
/** Use flash attention (default = false) */
public CBool flash_attn;
/** CUDA device to use (default = 0) */
public int gpu_device;
/** [EXPERIMENTAL] Enable token-level timestamps with DTW (default = false) */
public CBool dtw_token_timestamps;
/** [EXPERIMENTAL] Alignment heads preset for DTW */
public int dtw_aheads_preset;
/** Number of top layers to use for DTW when using WHISPER_AHEADS_N_TOP_MOST preset */
public int dtw_n_top;
public WhisperAheads.ByValue dtw_aheads;
/** DTW memory size (internal use) */
public NativeLong dtw_mem_size;
/** Use GPU for inference */
public void useGpu(boolean enable) {
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
}
/** Use flash attention */
public void useFlashAttn(boolean enable) {
flash_attn = enable ? CBool.TRUE : CBool.FALSE;
}
/** Enable DTW token-level timestamps */
public void enableDtwTokenTimestamps(boolean enable) {
dtw_token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
}
/** Set DTW alignment heads preset */
public void setDtwAheadsPreset(int preset) {
dtw_aheads_preset = preset;
}
@Override
protected List<String> getFieldOrder() {
return Arrays.asList(
"use_gpu",
"flash_attn",
"gpu_device",
"dtw_token_timestamps",
"dtw_aheads_preset",
"dtw_n_top",
"dtw_aheads",
"dtw_mem_size"
);
}
public static class ByValue extends WhisperContextParams implements Structure.ByValue {
public ByValue() { super(); }
public ByValue(Pointer p) { super(p); }
}
}

View File

@ -1,10 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import java.util.List;
public class WhisperFilters {
int n_mel;
int n_fft;
List<Float> data;
}

View File

@ -1,358 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.*;
import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
import io.github.ggerganov.whispercpp.callbacks.GgmlAbortCallback;
import java.util.Arrays;
import java.util.List;
/**
* Parameters for the whisper_full() function.
* If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
* whisper_full_default_params()
*/
public class WhisperFullParams extends Structure {
public WhisperFullParams() {
super();
}
public WhisperFullParams(Pointer p) {
super(p);
}
/** Sampling strategy for whisper_full() function. */
public int strategy;
/** Number of threads. (default = 4) */
public int n_threads;
/** Maximum tokens to use from past text as a prompt for the decoder. (default = 16384) */
public int n_max_text_ctx;
/** Start offset in milliseconds. (default = 0) */
public int offset_ms;
/** Audio duration to process in milliseconds. (default = 0) */
public int duration_ms;
/** Translate flag. (default = false) */
public CBool translate;
/** The compliment of translateMode() */
public void transcribeMode() {
translate = CBool.FALSE;
}
/** The compliment of transcribeMode() */
public void translateMode() {
translate = CBool.TRUE;
}
/** Flag to indicate whether to use past transcription (if any) as an initial prompt for the decoder. (default = true) */
public CBool no_context;
/** Flag to indicate whether to use past transcription (if any) as an initial prompt for the decoder. (default = true) */
public void enableContext(boolean enable) {
no_context = enable ? CBool.FALSE : CBool.TRUE;
}
/** Generate timestamps or not? */
public CBool no_timestamps;
/** Flag to force single segment output (useful for streaming). (default = false) */
public CBool single_segment;
/** Flag to force single segment output (useful for streaming). (default = false) */
public void singleSegment(boolean single) {
single_segment = single ? CBool.TRUE : CBool.FALSE;
}
/** Flag to print special tokens (e.g., &lt;SOT&gt;, &lt;EOT&gt;, &lt;BEG&gt;, etc.). (default = false) */
public CBool print_special;
/** Flag to print special tokens (e.g., &lt;SOT&gt;, &lt;EOT&gt;, &lt;BEG&gt;, etc.). (default = false) */
public void printSpecial(boolean enable) {
print_special = enable ? CBool.TRUE : CBool.FALSE;
}
/** Flag to print progress information. (default = true) */
public CBool print_progress;
/** Flag to print progress information. (default = true) */
public void printProgress(boolean enable) {
print_progress = enable ? CBool.TRUE : CBool.FALSE;
}
/** Flag to print results from within whisper.cpp (avoid it, use callback instead). (default = true) */
public CBool print_realtime;
/** Flag to print results from within whisper.cpp (avoid it, use callback instead). (default = true) */
public void printRealtime(boolean enable) {
print_realtime = enable ? CBool.TRUE : CBool.FALSE;
}
/** Flag to print timestamps for each text segment when printing realtime. (default = true) */
public CBool print_timestamps;
/** Flag to print timestamps for each text segment when printing realtime. (default = true) */
public void printTimestamps(boolean enable) {
print_timestamps = enable ? CBool.TRUE : CBool.FALSE;
}
/** [EXPERIMENTAL] Flag to enable token-level timestamps. (default = false) */
public CBool token_timestamps;
/** [EXPERIMENTAL] Flag to enable token-level timestamps. (default = false) */
public void tokenTimestamps(boolean enable) {
token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
}
/** [EXPERIMENTAL] Timestamp token probability threshold (~0.01). (default = 0.01) */
public float thold_pt;
/** [EXPERIMENTAL] Timestamp token sum probability threshold (~0.01). */
public float thold_ptsum;
/** Maximum segment length in characters. (default = 0) */
public int max_len;
/** Flag to split on word rather than on token (when used with max_len). (default = false) */
public CBool split_on_word;
/** Flag to split on word rather than on token (when used with max_len). (default = false) */
public void splitOnWord(boolean enable) {
split_on_word = enable ? CBool.TRUE : CBool.FALSE;
}
/** Maximum tokens per segment (0, default = no limit) */
public int max_tokens;
/** [EXPERIMENTAL] Enable debug mode for extra info */
public CBool debug_mode;
/** Enable debug mode */
public void enableDebugMode(boolean enable) {
debug_mode = enable ? CBool.TRUE : CBool.FALSE;
}
/** Overwrite the audio context size (0 = use default). */
public int audio_ctx;
/** Enable tinydiarize (default = false) */
public CBool tdrz_enable;
/** Enable tinydiarize (default = false) */
public void tdrzEnable(boolean enable) {
tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
}
/** Regular expression matching tokens to suppress. */
public String suppress_regex;
/** Tokens to provide to the whisper decoder as an initial prompt.
* These are prepended to any existing text context from a previous call. */
public String initial_prompt;
/** Prompt tokens. (int*) */
public Pointer prompt_tokens;
public void setPromptTokens(int[] tokens) {
Memory mem = new Memory(tokens.length * 4L);
mem.write(0, tokens, 0, tokens.length);
prompt_tokens = mem;
}
/** Number of prompt tokens. */
public int prompt_n_tokens;
/** Language for auto-detection.
* For auto-detection, set to `null`, `""`, or "auto". */
public String language;
/** Flag to indicate whether to detect language automatically. */
public CBool detect_language;
/** Flag to indicate whether to detect language automatically. */
public void detectLanguage(boolean enable) {
detect_language = enable ? CBool.TRUE : CBool.FALSE;
}
// Common decoding parameters.
/** Flag to suppress blank tokens. */
public CBool suppress_blank;
public void suppressBlanks(boolean enable) {
suppress_blank = enable ? CBool.TRUE : CBool.FALSE;
}
/** Flag to suppress non-speech tokens. */
public CBool suppress_nst;
/** Flag to suppress non-speech tokens. */
public void suppressNonSpeechTokens(boolean enable) {
suppress_nst = enable ? CBool.TRUE : CBool.FALSE;
}
/** Initial decoding temperature. */
public float temperature;
/** Maximum initial timestamp. */
public float max_initial_ts;
/** Length penalty. */
public float length_penalty;
// Fallback parameters.
/** Temperature increment. */
public float temperature_inc;
/** Entropy threshold (similar to OpenAI's "compression_ratio_threshold"). */
public float entropy_thold;
/** Log probability threshold. */
public float logprob_thold;
/** No speech threshold. */
public float no_speech_thold;
/** Greedy decoding parameters. */
public GreedyParams greedy;
/**
* Beam search decoding parameters.
*/
public BeamSearchParams beam_search;
public void setBestOf(int bestOf) {
if (greedy == null) {
greedy = new GreedyParams();
}
greedy.best_of = bestOf;
}
public void setBeamSize(int beamSize) {
if (beam_search == null) {
beam_search = new BeamSearchParams();
}
beam_search.beam_size = beamSize;
}
public void setBeamSizeAndPatience(int beamSize, float patience) {
if (beam_search == null) {
beam_search = new BeamSearchParams();
}
beam_search.beam_size = beamSize;
beam_search.patience = patience;
}
/**
* Callback for every newly generated text segment.
* WhisperNewSegmentCallback
*/
public Pointer new_segment_callback;
/**
* User data for the new_segment_callback.
*/
public Pointer new_segment_callback_user_data;
/**
* Callback on each progress update.
* WhisperProgressCallback
*/
public Pointer progress_callback;
/**
* User data for the progress_callback.
*/
public Pointer progress_callback_user_data;
/**
* Callback each time before the encoder starts.
* WhisperEncoderBeginCallback
*/
public Pointer encoder_begin_callback;
/**
* User data for the encoder_begin_callback.
*/
public Pointer encoder_begin_callback_user_data;
/** Callback used to abort GGML computation */
public Pointer abort_callback;
/** User data for the abort_callback */
public Pointer abort_callback_user_data;
public void setAbortCallback(GgmlAbortCallback callback) {
abort_callback = CallbackReference.getFunctionPointer(callback);
}
/**
* Callback by each decoder to filter obtained logits.
* WhisperLogitsFilterCallback
*/
public Pointer logits_filter_callback;
/**
* User data for the logits_filter_callback.
*/
public Pointer logits_filter_callback_user_data;
public void setNewSegmentCallback(WhisperNewSegmentCallback callback) {
new_segment_callback = CallbackReference.getFunctionPointer(callback);
}
public void setProgressCallback(WhisperProgressCallback callback) {
progress_callback = CallbackReference.getFunctionPointer(callback);
}
public void setEncoderBeginCallbackeginCallbackCallback(WhisperEncoderBeginCallback callback) {
encoder_begin_callback = CallbackReference.getFunctionPointer(callback);
}
public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
logits_filter_callback = CallbackReference.getFunctionPointer(callback);
}
/** Grammar stuff */
public Pointer grammar_rules;
public long n_grammar_rules;
public long i_start_rule;
public float grammar_penalty;
@Override
protected List<String> getFieldOrder() {
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
"offset_ms", "duration_ms", "translate", "no_context",
"no_timestamps", "single_segment", "print_special",
"print_progress", "print_realtime", "print_timestamps",
"token_timestamps", "thold_pt", "thold_ptsum", "max_len",
"split_on_word", "max_tokens", "debug_mode", "audio_ctx",
"tdrz_enable", "suppress_regex", "initial_prompt",
"prompt_tokens", "prompt_n_tokens", "language", "detect_language",
"suppress_blank", "suppress_nst", "temperature",
"max_initial_ts", "length_penalty", "temperature_inc",
"entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
"beam_search", "new_segment_callback", "new_segment_callback_user_data",
"progress_callback", "progress_callback_user_data",
"encoder_begin_callback", "encoder_begin_callback_user_data",
"abort_callback", "abort_callback_user_data",
"logits_filter_callback", "logits_filter_callback_user_data",
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
}
public static class ByValue extends WhisperFullParams implements Structure.ByValue {
public ByValue() { super(); }
public ByValue(Pointer p) { super(p); }
}
}

View File

@ -1,15 +0,0 @@
package io.github.ggerganov.whispercpp.params;
public class WhisperHParams {
int n_vocab = 51864;
int n_audio_ctx = 1500;
int n_audio_state = 384;
int n_audio_head = 6;
int n_audio_layer = 4;
int n_text_ctx = 448;
int n_text_state = 384;
int n_text_head = 6;
int n_text_layer = 4;
int n_mels = 80;
int ftype = 1;
}

View File

@ -1,10 +0,0 @@
package io.github.ggerganov.whispercpp.params;
/** Available sampling strategies */
public enum WhisperSamplingStrategy {
/** similar to OpenAI's GreedyDecoder */
WHISPER_SAMPLING_GREEDY,
/** similar to OpenAI's BeamSearchDecoder */
WHISPER_SAMPLING_BEAM_SEARCH
}

View File

@ -1,144 +0,0 @@
package io.github.ggerganov.whispercpp;
import static org.junit.jupiter.api.Assertions.*;
import io.github.ggerganov.whispercpp.bean.WhisperSegment;
import io.github.ggerganov.whispercpp.params.CBool;
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
class WhisperCppTest {
private static WhisperCpp whisper = new WhisperCpp();
private static boolean modelInitialised = false;
@BeforeAll
static void init() throws FileNotFoundException {
// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
// or you can provide the absolute path to the model file.
//String modelName = "../../models/ggml-tiny.bin";
String modelName = "../../models/ggml-tiny.en.bin";
try {
whisper.initContext(modelName);
//whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
//whisper.getJavaDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
modelInitialised = true;
} catch (FileNotFoundException ex) {
System.out.println("Model " + modelName + " not found");
}
}
@Test
void testGetDefaultFullParams_BeamSearch() {
// When
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
// Then
assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal(), params.strategy);
assertNotEquals(0, params.n_threads);
assertEquals(16384, params.n_max_text_ctx);
assertFalse(params.translate);
assertEquals(0.01f, params.thold_pt);
assertEquals(5, params.beam_search.beam_size);
assertEquals(-1.0f, params.beam_search.patience);
}
@Test
void testGetDefaultFullParams_Greedy() {
// When
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
// Then
assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
assertNotEquals(0, params.n_threads);
assertEquals(16384, params.n_max_text_ctx);
assertEquals(5, params.greedy.best_of);
}
@Test
void testFullTranscribe() throws Exception {
if (!modelInitialised) {
System.out.println("Model not initialised, skipping test");
return;
}
// Given
File file = new File(System.getProperty("user.dir"), "../../samples/jfk.wav");
AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(file);
byte[] b = new byte[audioInputStream.available()];
float[] floats = new float[b.length / 2];
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
params.print_progress = CBool.FALSE;
//params.initial_prompt = "and so my fellow Americans um, like";
try {
audioInputStream.read(b);
for (int i = 0, j = 0; i < b.length; i += 2, j++) {
int intSample = (int) (b[i + 1]) << 8 | (int) (b[i]) & 0xFF;
floats[j] = intSample / 32767.0f;
}
// When
String result = whisper.fullTranscribe(params, floats);
// Then
System.err.println(result);
assertEquals("And so my fellow Americans ask not what your country can do for you " +
"ask what you can do for your country.",
result.replace(",", ""));
} finally {
audioInputStream.close();
}
}
@Test
void testFullTranscribeWithTime() throws Exception {
if (!modelInitialised) {
System.out.println("Model not initialised, skipping test");
return;
}
// Given
File file = new File(System.getProperty("user.dir"), "../../samples/jfk.wav");
AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(file);
byte[] b = new byte[audioInputStream.available()];
float[] floats = new float[b.length / 2];
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
params.print_progress = CBool.FALSE;
//params.initial_prompt = "and so my fellow Americans um, like";
try {
audioInputStream.read(b);
for (int i = 0, j = 0; i < b.length; i += 2, j++) {
int intSample = (int) (b[i + 1]) << 8 | (int) (b[i]) & 0xFF;
floats[j] = intSample / 32767.0f;
}
List<WhisperSegment> segments = whisper.fullTranscribeWithTime(params, floats);
assertTrue(segments.size() > 0, "The size of segments should be greater than 0");
for (WhisperSegment segment : segments) {
System.out.println(segment);
}
} finally {
audioInputStream.close();
}
}
}

View File

@ -1,17 +0,0 @@
package io.github.ggerganov.whispercpp;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
class WhisperJnaLibraryTest {
@Test
void testWhisperPrint_system_info() {
String systemInfo = WhisperCppJnaLibrary.instance.whisper_print_system_info();
// eg: "AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0
// | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | "
System.out.println("System info: " + systemInfo);
assertTrue(systemInfo.length() > 10);
}
}

View File

@ -33,9 +33,6 @@ mkdir build-em && cd build-em
emcmake cmake .. && make -j
# run test
node ../tests/test-whisper.js
# For Node.js versions prior to v16.4.0, experimental features need to be enabled:
node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
# publish npm package
@ -44,7 +41,7 @@ make publish-npm
## Sample run
```text
```java
$ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
whisper_model_load: loading model from 'whisper.bin'
@ -66,7 +63,7 @@ whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model size = 140.54 MB
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...

View File

@ -20,7 +20,7 @@ struct whisper_context * g_context;
EMSCRIPTEN_BINDINGS(whisper) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
if (g_context == nullptr) {
g_context = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_context = whisper_init_from_file(path_model.c_str());
if (g_context != nullptr) {
return true;
} else {

View File

@ -1 +1 @@
"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:f=>(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f),postMessage:msg=>parentPort.postMessage(msg),performance:global.performance||{now:Date.now}})}var initializedJS=false;function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var module=Module["wasmModule"];Module["wasmModule"]=null;var instance=new WebAssembly.Instance(module,info);return receiveInstance(instance)};self.onunhandledrejection=e=>{throw e.reason||e};function handleMessage(e){try{if(e.data.cmd==="load"){let messageQueue=[];self.onmessage=e=>messageQueue.push(e);self.startWorker=instance=>{Module=instance;postMessage({"cmd":"loaded"});for(let msg of messageQueue){handleMessage(msg)}self.onmessage=handleMessage};Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=(...args)=>{postMessage({cmd:"callHandler",handler:handler,args:args})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module)}else if(e.data.cmd==="run"){Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["__emscripten_thread_mailbox_await"](e.data.pthread_ptr);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){throw ex}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="checkMailbox"){if(initializedJS){Module["checkMailbox"]()}}else if(e.data.cmd){err(`worker.js received unknown command ${e.data.cmd}`);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}}self.onmessage=handleMessage;
"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:function(f){(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f)},postMessage:function(msg){parentPort.postMessage(msg)},performance:global.performance||{now:function(){return Date.now()}}})}var initializedJS=false;var pendingNotifiedProxyingQueues=[];function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var instance=new WebAssembly.Instance(Module["wasmModule"],info);receiveInstance(instance);Module["wasmModule"]=null;return instance.exports};self.onunhandledrejection=e=>{throw e.reason??e};self.onmessage=e=>{try{if(e.data.cmd==="load"){Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=function(){postMessage({cmd:"callHandler",handler:handler,args:[...arguments]})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module).then(function(instance){Module=instance})}else if(e.data.cmd==="run"){Module["__performance_now_clock_drift"]=performance.now()-e.data.time;Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();pendingNotifiedProxyingQueues.forEach(queue=>{Module["executeNotifiedProxyingQueue"](queue)});pendingNotifiedProxyingQueues=[];initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){if(ex instanceof Module["ExitStatus"]){if(Module["keepRuntimeAlive"]()){}else{Module["__emscripten_thread_exit"](ex.status)}}else{throw ex}}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="processProxyingQueue"){if(initializedJS){Module["executeNotifiedProxyingQueue"](e.data.queue)}else{pendingNotifiedProxyingQueues.push(e.data.queue)}}else if(e.data.cmd){err("worker.js received unknown command "+e.data.cmd);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}};

View File

@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.7.4",
"version": "1.2.0",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

File diff suppressed because one or more lines are too long

View File

@ -1,3 +0,0 @@
LICENSE
pkg/
lib/whisper.*

View File

@ -1,245 +0,0 @@
whispercpp
==========
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
Installation
------------
Install the gem and add to the application's Gemfile by executing:
$ bundle add whispercpp
If bundler is not being used to manage dependencies, install the gem by executing:
$ gem install whispercpp
Usage
-----
```ruby
require "whisper"
whisper = Whisper::Context.new("base")
params = Whisper::Params.new(
language: "en",
offset: 10_000,
duration: 60_000,
max_text_tokens: 300,
translate: true,
print_timestamps: false,
initial_prompt: "Initial prompt here."
)
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
puts whole_text
end
```
### Preparing model ###
Some models are prepared up-front:
```ruby
base_en = Whisper::Model.pre_converted_models["base.en"]
whisper = Whisper::Context.new(base_en)
```
At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
```ruby
Whisper::Model.pre_converted_models["base"].clear_cache
```
You also can use shorthand for pre-converted models:
```ruby
whisper = Whisper::Context.new("base.en")
```
You can see the list of prepared model names by `Whisper::Model.pre_converted_models.keys`:
```ruby
puts Whisper::Model.pre_converted_models.keys
# tiny
# tiny.en
# tiny-q5_1
# tiny.en-q5_1
# tiny-q8_0
# base
# base.en
# base-q5_1
# base.en-q5_1
# base-q8_0
# :
# :
```
You can also use local model files you prepared:
```ruby
whisper = Whisper::Context.new("path/to/your/model.bin")
```
Or, you can download model files:
```ruby
whisper = Whisper::Context.new("https://example.net/uri/of/your/model.bin")
# Or
whisper = Whisper::Context.new(URI("https://example.net/uri/of/your/model.bin"))
```
See [models][] page for details.
### Preparing audio file ###
Currently, whisper.cpp accepts only 16-bit WAV files.
API
---
### Segments ###
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
whisper
.transcribe("path/to/audio.wav", params)
.each_segment.with_index do |segment, index|
line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
nth: index + 1,
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
```
You can also add hook to params called on new segment:
```ruby
# Add hook before calling #transcribe
params.on_new_segment do |segment|
line = "[%{st} --> %{ed}] %{text}" % {
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
whisper.transcribe("path/to/audio.wav", params)
```
### Models ###
You can see model information:
```ruby
whisper = Whisper::Context.new("base")
model = whisper.model
model.n_vocab # => 51864
model.n_audio_ctx # => 1500
model.n_audio_state # => 512
model.n_audio_head # => 8
model.n_audio_layer # => 6
model.n_text_ctx # => 448
model.n_text_state # => 512
model.n_text_head # => 8
model.n_text_layer # => 6
model.n_mels # => 80
model.ftype # => 1
model.type # => "base"
```
### Logging ###
You can set log callback:
```ruby
prefix = "[MyApp] "
log_callback = ->(level, buffer, user_data) {
case level
when Whisper::LOG_LEVEL_NONE
puts "#{user_data}none: #{buffer}"
when Whisper::LOG_LEVEL_INFO
puts "#{user_data}info: #{buffer}"
when Whisper::LOG_LEVEL_WARN
puts "#{user_data}warn: #{buffer}"
when Whisper::LOG_LEVEL_ERROR
puts "#{user_data}error: #{buffer}"
when Whisper::LOG_LEVEL_DEBUG
puts "#{user_data}debug: #{buffer}"
when Whisper::LOG_LEVEL_CONT
puts "#{user_data}same to previous: #{buffer}"
end
}
Whisper.log_set log_callback, prefix
```
Using this feature, you are also able to suppress log:
```ruby
Whisper.log_set ->(level, buffer, user_data) {
# do nothing
}, nil
Whisper::Context.new("base")
```
### Low-level API to transcribe ###
You can also call `Whisper::Context#full` and `#full_parallel` with a Ruby array as samples. Although `#transcribe` with audio file path is recommended because it extracts PCM samples in C++ and is fast, `#full` and `#full_parallel` give you flexibility.
```ruby
require "whisper"
require "wavefile"
reader = WaveFile::Reader.new("path/to/audio.wav", WaveFile::Format.new(:mono, :float, 16000))
samples = reader.enum_for(:each_buffer).map(&:samples).flatten
whisper = Whisper::Context.new("base")
whisper
.full(Whisper::Params.new, samples)
.each_segment do |segment|
puts segment.text
end
```
The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
Development
-----------
% git clone https://github.com/ggerganov/whisper.cpp.git
% cd whisper.cpp/bindings/ruby
% rake test
First call of `rake test` builds an extension and downloads a model for testing. After that, you add tests in `tests` directory and modify `ext/ruby_whisper.cpp`.
If something seems wrong on build, running `rake clean` solves some cases.
License
-------
The same to [whisper.cpp][].
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models

View File

@ -1,66 +0,0 @@
require 'rake/clean'
require "bundler/gem_tasks"
require "rake/testtask"
require_relative "extsources"
SOURCES = FileList[]
EXTSOURCES.each do |src|
basename = src.pathmap("%f")
dest = basename == "LICENSE" ? basename : src.pathmap("%{../..,ext}p")
dir = dest.pathmap("%d")
file src
directory dir
file dest => [src, dir] do |t|
cp t.source, t.name
end
SOURCES.include dest
end
CLEAN.include SOURCES
CLEAN.include FileList["ext/**/*.o", "ext/**/*.metal", "ext/**/*.tmp", "ext/whisper.{so,bundle,dll}"]
SRC = FileList["ext/*.{c,cpp,h}"]
task build: SOURCES
directory "pkg"
CLOBBER.include "pkg"
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
SO_FILE = File.join("ext", LIB_NAME)
LIB_FILE = File.join("lib", LIB_NAME)
file "ext/Makefile" => SRC + ["ext/extconf.rb"] + SOURCES do |t|
chdir "ext" do
ruby "extconf.rb"
end
end
file SO_FILE => "ext/Makefile" do |t|
chdir "ext" do
sh "make"
end
end
CLEAN.include SO_FILE
directory "lib"
file LIB_FILE => [SO_FILE, "lib"] do |t|
copy t.source, t.name
end
CLEAN.include LIB_FILE
Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end
TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
chdir "tests/jfk_reader" do
ruby "extconf.rb"
sh "make"
end
end
CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
task test: [LIB_FILE, TEST_MEMORY_VIEW]

View File

@ -1,11 +1,7 @@
Makefile
whisper.so
ggml.c
ggml.h
whisper.bundle
whisper.dll
scripts/get-flags.mk
*.o
/*/**/*.c
/*/**/*.cpp
/*/**/*.h
/*/**/*.m
/*/**/*.metal
whisper.cpp
whisper.h
dr_wav.h

View File

@ -1,13 +0,0 @@
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/src/ggml-cpu/unary-ops.cpp \
ggml/src/ggml-cpu/binary-ops.cpp \
ggml/src/ggml-cpu/vec.cpp \
ggml/src/ggml-cpu/ops.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@

View File

@ -1,10 +1,13 @@
require 'mkmf'
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
# need to use c++ compiler flags
$CXXFLAGS << ' -std=c++17'
$LDFLAGS << ' -lstdc++'
$CXXFLAGS << ' -std=c++11'
# Set to true when building binary gems
if enable_config('static-stdlib', false)
$LDFLAGS << ' -static-libgcc -static-libstdc++'
@ -15,198 +18,4 @@ if enable_config('march-tune-native', false)
$CXXFLAGS << ' -march=native -mtune=native'
end
if ENV['WHISPER_METAL']
$GGML_METAL ||= true
$DEPRECATE_WARNING ||= true
end
$UNAME_S = `uname -s`.chomp
$UNAME_P = `uname -p`.chomp
$UNAME_M = `uname -m`.chomp
if $UNAME_S == 'Darwin'
unless ENV['GGML_NO_METAL']
$GGML_METAL ||= true
end
$GGML_NO_OPENMP ||= true
end
if $GGML_METAL
$GGML_METAL_EMBED_LIBRARY = true
end
$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples -DGGML_USE_CPU'
$MK_CFLAGS = '-std=c11 -fPIC'
$MK_CXXFLAGS = '-std=c++17 -fPIC'
$MK_NVCCFLAGS = '-std=c++17'
$MK_LDFLAGS = ''
$OBJ_GGML = []
$OBJ_WHISPER = []
$OBJ_COMMON = []
$OBJ_SDL = []
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
if $UNAME_S == 'Linux'
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
end
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
end
if ENV['WHISPER_DEBUG']
$MK_CFLAGS << ' -O0 -g'
$MK_CXXFLAGS << ' -O0 -g'
$MK_LDFLAGS << ' -g'
$MK_NVCCFLAGS << ' -O0 -g'
else
$MK_CPPFLAGS << ' -DNDEBUG'
$MK_CFLAGS << ' -O3'
$MK_CXXFLAGS << ' -O3'
$MK_NVCCFLAGS << ' -O3'
end
$WARN_FLAGS =
' -Wall' <<
' -Wextra' <<
' -Wpedantic' <<
' -Wcast-qual' <<
' -Wno-unused-function'
$MK_CFLAGS <<
$WARN_FLAGS <<
' -Wshadow' <<
' -Wstrict-prototypes' <<
' -Wpointer-arith' <<
' -Wmissing-prototypes' <<
' -Werror=implicit-int' <<
' -Werror=implicit-function-declaration'
$MK_CXXFLAGS <<
$WARN_FLAGS <<
' -Wmissing-declarations' <<
' -Wmissing-noreturn'
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
end
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
$MK_CFLAGS << ' -pthread'
$MK_CXXFLAGS << ' -pthread'
end
unless $_WIN32
$DSO_EXT = '.so'
else
$DSO_EXT = '.dll'
end
unless ENV['RISCV']
if %w[x86_64 i686 amd64].include? $UNAME_M
$HOST_CXXFLAGS ||= ''
$MK_CFLAGS << ' -march=native -mtune=native'
$HOST_CXXFLAGS << ' -march=native -mtune=native'
end
else
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
end
unless ENV['GGML_NO_ACCELERATE']
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE'
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
$MK_LDFLAGS << ' -framework Accelerate'
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
end
end
if ENV['GGML_OPENBLAS']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
end
if ENV['GGML_OPENBLAS64']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
end
if $GGML_METAL
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal.o'
if ENV['GGML_METAL_NDEBUG']
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
end
if $GGML_METAL_EMBED_LIBRARY
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal-embed.o'
end
end
$OBJ_GGML <<
'ggml/src/ggml.o' <<
'ggml/src/ggml-alloc.o' <<
'ggml/src/ggml-backend.o' <<
'ggml/src/ggml-backend-reg.o' <<
'ggml/src/ggml-opt.o' <<
'ggml/src/ggml-quants.o' <<
'ggml/src/ggml-threading.o' <<
'ggml/src/ggml-cpu/ggml-cpu.o' <<
'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
'ggml/src/ggml-cpu/ggml-cpu-traits.o' <<
'ggml/src/ggml-cpu/unary-ops.o' <<
'ggml/src/ggml-cpu/binary-ops.o' <<
'ggml/src/ggml-cpu/vec.o' <<
'ggml/src/ggml-cpu/ops.o'
$OBJ_WHISPER <<
'src/whisper.o' <<
'examples/common.o' <<
'examples/common-whisper.o'
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
$objs <<
"ruby_whisper.o" <<
"ruby_whisper_context.o" <<
"ruby_whisper_transcribe.o" <<
"ruby_whisper_params.o" <<
"ruby_whisper_error.o" <<
"ruby_whisper_segment.o" <<
"ruby_whisper_model.o"
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
create_makefile('whisper')
File.open 'Makefile', 'a' do |file|
file.puts 'include scripts/get-flags.mk'
file.puts 'include cpu.mk'
if $GGML_METAL
file.puts 'include metal.mk'
if $GGML_METAL_EMBED_LIBRARY
file.puts 'include metal-embed.mk'
end
end
end

View File

@ -1,17 +0,0 @@
ggml/src/ggml-metal/ggml-metal-embed.o: \
ggml/src/ggml-metal/ggml-metal.metal \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-common.h
@echo "Embedding Metal library"
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY}

View File

@ -1,6 +0,0 @@
ggml/src/ggml-metal/ggml-metal.o: \
ggml/src/ggml-metal/ggml-metal.m \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/include/ggml-metal.h \
ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@

View File

@ -1,164 +0,0 @@
#include <ruby.h>
#include <ruby/memory_view.h>
#include "ruby_whisper.h"
VALUE mWhisper;
VALUE cContext;
VALUE cParams;
VALUE eError;
VALUE cSegment;
VALUE cModel;
ID id_to_s;
ID id_call;
ID id___method__;
ID id_to_enum;
ID id_length;
ID id_next;
ID id_new;
ID id_to_path;
ID id_URI;
ID id_pre_converted_models;
static bool is_log_callback_finalized = false;
// High level API
extern VALUE ruby_whisper_segment_allocate(VALUE klass);
extern void init_ruby_whisper_context(VALUE *mWhisper);
extern void init_ruby_whisper_params(VALUE *mWhisper);
extern void init_ruby_whisper_error(VALUE *mWhisper);
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
extern void init_ruby_whisper_model(VALUE *mWhisper);
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
/*
* call-seq:
* lang_max_id -> Integer
*/
static VALUE ruby_whisper_s_lang_max_id(VALUE self) {
return INT2NUM(whisper_lang_max_id());
}
/*
* call-seq:
* lang_id(lang_name) -> Integer
*/
static VALUE ruby_whisper_s_lang_id(VALUE self, VALUE lang) {
const char * lang_str = StringValueCStr(lang);
const int id = whisper_lang_id(lang_str);
if (-1 == id) {
rb_raise(rb_eArgError, "language not found: %s", lang_str);
}
return INT2NUM(id);
}
/*
* call-seq:
* lang_str(lang_id) -> String
*/
static VALUE ruby_whisper_s_lang_str(VALUE self, VALUE id) {
const int lang_id = NUM2INT(id);
const char * str = whisper_lang_str(lang_id);
if (NULL == str) {
rb_raise(rb_eIndexError, "id %d outside of language id", lang_id);
}
return rb_str_new2(str);
}
/*
* call-seq:
* lang_str(lang_id) -> String
*/
static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
const int lang_id = NUM2INT(id);
const char * str_full = whisper_lang_str_full(lang_id);
if (NULL == str_full) {
rb_raise(rb_eIndexError, "id %d outside of language id", lang_id);
}
return rb_str_new2(str_full);
}
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
is_log_callback_finalized = true;
return Qnil;
}
static void
ruby_whisper_log_callback(enum ggml_log_level level, const char * buffer, void * user_data) {
if (is_log_callback_finalized) {
return;
}
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
VALUE udata = rb_iv_get(mWhisper, "user_data");
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
}
/*
* call-seq:
* log_set ->(level, buffer, user_data) { ... }, user_data -> nil
*/
static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_data) {
VALUE old_callback = rb_iv_get(self, "log_callback");
if (!NIL_P(old_callback)) {
rb_undefine_finalizer(old_callback);
}
rb_iv_set(self, "log_callback", log_callback);
rb_iv_set(self, "user_data", user_data);
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
rb_define_finalizer(log_callback, finalize_log_callback);
whisper_log_set(ruby_whisper_log_callback, NULL);
return Qnil;
}
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
rb_gc_mark(rwm->context);
}
static VALUE ruby_whisper_model_allocate(VALUE klass) {
ruby_whisper_model *rwm;
rwm = ALLOC(ruby_whisper_model);
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
}
void Init_whisper() {
id_to_s = rb_intern("to_s");
id_call = rb_intern("call");
id___method__ = rb_intern("__method__");
id_to_enum = rb_intern("to_enum");
id_length = rb_intern("length");
id_next = rb_intern("next");
id_new = rb_intern("new");
id_to_path = rb_intern("to_path");
id_URI = rb_intern("URI");
id_pre_converted_models = rb_intern("pre_converted_models");
mWhisper = rb_define_module("Whisper");
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
rb_define_const(mWhisper, "LOG_LEVEL_WARN", INT2NUM(GGML_LOG_LEVEL_WARN));
rb_define_const(mWhisper, "LOG_LEVEL_ERROR", INT2NUM(GGML_LOG_LEVEL_ERROR));
rb_define_const(mWhisper, "LOG_LEVEL_DEBUG", INT2NUM(GGML_LOG_LEVEL_DEBUG));
rb_define_const(mWhisper, "LOG_LEVEL_CONT", INT2NUM(GGML_LOG_LEVEL_CONT));
rb_define_singleton_method(mWhisper, "lang_max_id", ruby_whisper_s_lang_max_id, 0);
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
init_ruby_whisper_context(&mWhisper);
init_ruby_whisper_params(&mWhisper);
init_ruby_whisper_error(&mWhisper);
init_ruby_whisper_segment(&mWhisper, &cContext);
init_ruby_whisper_model(&mWhisper);
rb_require("whisper/model/uri");
}

View File

@ -0,0 +1,426 @@
#include <ruby.h>
#include "ruby_whisper.h"
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <cmath>
#include <fstream>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#ifdef __cplusplus
extern "C" {
#endif
#define BOOL_PARAMS_SETTER(self, prop, value) \
ruby_whisper_params *rwp; \
Data_Get_Struct(self, ruby_whisper_params, rwp); \
if (value == Qfalse || value == Qnil) { \
rwp->params.prop = false; \
} else { \
rwp->params.prop = true; \
} \
return value; \
#define BOOL_PARAMS_GETTER(self, prop) \
ruby_whisper_params *rwp; \
Data_Get_Struct(self, ruby_whisper_params, rwp); \
if (rwp->params.prop) { \
return Qtrue; \
} else { \
return Qfalse; \
}
VALUE mWhisper;
VALUE cContext;
VALUE cParams;
static void ruby_whisper_free(ruby_whisper *rw) {
if (rw->context) {
whisper_free(rw->context);
rw->context = NULL;
}
}
static void ruby_whisper_params_free(ruby_whisper_params *rwp) {
}
void rb_whisper_mark(ruby_whisper *rw) {
// call rb_gc_mark on any ruby references in rw
}
void rb_whisper_free(ruby_whisper *rw) {
ruby_whisper_free(rw);
free(rw);
}
void rb_whisper_params_mark(ruby_whisper_params *rwp) {
}
void rb_whisper_params_free(ruby_whisper_params *rwp) {
ruby_whisper_params_free(rwp);
free(rwp);
}
static VALUE ruby_whisper_allocate(VALUE klass) {
ruby_whisper *rw;
rw = ALLOC(ruby_whisper);
rw->context = NULL;
return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
}
static VALUE ruby_whisper_params_allocate(VALUE klass) {
ruby_whisper_params *rwp;
rwp = ALLOC(ruby_whisper_params);
rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
}
static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
ruby_whisper *rw;
VALUE whisper_model_file_path;
// TODO: we can support init from buffer here too maybe another ruby object to expose
rb_scan_args(argc, argv, "01", &whisper_model_file_path);
Data_Get_Struct(self, ruby_whisper, rw);
if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
}
rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
if (rw->context == nullptr) {
rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
}
return self;
}
/*
* transcribe a single file
* can emit to a block results
*
**/
static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
ruby_whisper *rw;
ruby_whisper_params *rwp;
VALUE wave_file_path, blk, params;
rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
Data_Get_Struct(self, ruby_whisper, rw);
Data_Get_Struct(params, ruby_whisper_params, rwp);
if (!rb_respond_to(wave_file_path, rb_intern("to_s"))) {
rb_raise(rb_eRuntimeError, "Expected file path to wave file");
}
std::string fname_inp = StringValueCStr(wave_file_path);
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
// WAV input - this is directly from main.cpp example
{
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
if (fname_inp == "-") {
{
uint8_t buf[1024];
while (true) {
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return self;
}
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
} else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
return self;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
return self;
}
if (rwp->diarize && wav.channels != 2 && rwp->params.print_timestamps == false) {
fprintf(stderr, "WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
return self;
}
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
fprintf(stderr, "WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
return self;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "WAV file '%s' must be 16-bit\n", fname_inp.c_str());
return self;
}
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (rwp->diarize) {
// convert to stereo, float
pcmf32s.resize(2);
pcmf32s[0].resize(n);
pcmf32s[1].resize(n);
for (uint64_t i = 0; i < n; i++) {
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
}
}
}
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
rwp->params.encoder_begin_callback_user_data = &is_aborted;
}
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
fprintf(stderr, "failed to process audio\n");
return self;
}
const int n_segments = whisper_full_n_segments(rw->context);
VALUE output = rb_str_new2("");
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(rw->context, i);
output = rb_str_concat(output, rb_str_new2(text));
}
VALUE idCall = rb_intern("call");
if (blk != Qnil) {
rb_funcall(blk, idCall, 1, output);
}
return self;
}
/*
* params.language = "auto" | "en", etc...
*/
static VALUE ruby_whisper_params_set_language(VALUE self, VALUE value) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
if (value == Qfalse || value == Qnil) {
rwp->params.language = "auto";
} else {
rwp->params.language = StringValueCStr(value);
}
return value;
}
static VALUE ruby_whisper_params_get_language(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
if (rwp->params.language) {
return rb_str_new2(rwp->params.language);
} else {
return rb_str_new2("auto");
}
}
static VALUE ruby_whisper_params_set_translate(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, translate, value)
}
static VALUE ruby_whisper_params_get_translate(VALUE self) {
BOOL_PARAMS_GETTER(self, translate)
}
static VALUE ruby_whisper_params_set_no_context(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, no_context, value)
}
static VALUE ruby_whisper_params_get_no_context(VALUE self) {
BOOL_PARAMS_GETTER(self, no_context)
}
static VALUE ruby_whisper_params_set_single_segment(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, single_segment, value)
}
static VALUE ruby_whisper_params_get_single_segment(VALUE self) {
BOOL_PARAMS_GETTER(self, single_segment)
}
static VALUE ruby_whisper_params_set_print_special(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, print_special, value)
}
static VALUE ruby_whisper_params_get_print_special(VALUE self) {
BOOL_PARAMS_GETTER(self, print_special)
}
static VALUE ruby_whisper_params_set_print_progress(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, print_progress, value)
}
static VALUE ruby_whisper_params_get_print_progress(VALUE self) {
BOOL_PARAMS_GETTER(self, print_progress)
}
static VALUE ruby_whisper_params_set_print_realtime(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, print_realtime, value)
}
static VALUE ruby_whisper_params_get_print_realtime(VALUE self) {
BOOL_PARAMS_GETTER(self, print_realtime)
}
static VALUE ruby_whisper_params_set_print_timestamps(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, print_timestamps, value)
}
static VALUE ruby_whisper_params_get_print_timestamps(VALUE self) {
BOOL_PARAMS_GETTER(self, print_timestamps)
}
static VALUE ruby_whisper_params_set_suppress_blank(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, suppress_blank, value)
}
static VALUE ruby_whisper_params_get_suppress_blank(VALUE self) {
BOOL_PARAMS_GETTER(self, suppress_blank)
}
static VALUE ruby_whisper_params_set_suppress_non_speech_tokens(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, suppress_non_speech_tokens, value)
}
static VALUE ruby_whisper_params_get_suppress_non_speech_tokens(VALUE self) {
BOOL_PARAMS_GETTER(self, suppress_non_speech_tokens)
}
static VALUE ruby_whisper_params_get_token_timestamps(VALUE self) {
BOOL_PARAMS_GETTER(self, token_timestamps)
}
static VALUE ruby_whisper_params_set_token_timestamps(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, token_timestamps, value)
}
static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
BOOL_PARAMS_GETTER(self, split_on_word)
}
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, split_on_word, value)
}
static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
BOOL_PARAMS_GETTER(self, speed_up)
}
static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, speed_up, value)
}
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
if (rwp->diarize) {
return Qtrue;
} else {
return Qfalse;
}
}
static VALUE ruby_whisper_params_set_diarize(VALUE self, VALUE value) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
if (value == Qfalse || value == Qnil) {
rwp->diarize = false;
} else {
rwp->diarize = true;
} \
return value;
}
static VALUE ruby_whisper_params_get_offset(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
return INT2NUM(rwp->params.offset_ms);
}
static VALUE ruby_whisper_params_set_offset(VALUE self, VALUE value) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.offset_ms = NUM2INT(value);
return value;
}
static VALUE ruby_whisper_params_get_duration(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
return INT2NUM(rwp->params.duration_ms);
}
static VALUE ruby_whisper_params_set_duration(VALUE self, VALUE value) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.duration_ms = NUM2INT(value);
return value;
}
static VALUE ruby_whisper_params_get_max_text_tokens(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
return INT2NUM(rwp->params.n_max_text_ctx);
}
static VALUE ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.n_max_text_ctx = NUM2INT(value);
return value;
}
void Init_whisper() {
mWhisper = rb_define_module("Whisper");
cContext = rb_define_class_under(mWhisper, "Context", rb_cObject);
cParams = rb_define_class_under(mWhisper, "Params", rb_cObject);
rb_define_alloc_func(cContext, ruby_whisper_allocate);
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
rb_define_alloc_func(cParams, ruby_whisper_params_allocate);
rb_define_method(cParams, "language=", ruby_whisper_params_set_language, 1);
rb_define_method(cParams, "language", ruby_whisper_params_get_language, 0);
rb_define_method(cParams, "translate=", ruby_whisper_params_set_translate, 1);
rb_define_method(cParams, "translate", ruby_whisper_params_get_translate, 0);
rb_define_method(cParams, "no_context=", ruby_whisper_params_set_no_context, 1);
rb_define_method(cParams, "no_context", ruby_whisper_params_get_no_context, 0);
rb_define_method(cParams, "single_segment=", ruby_whisper_params_set_single_segment, 1);
rb_define_method(cParams, "single_segment", ruby_whisper_params_get_single_segment, 0);
rb_define_method(cParams, "print_special", ruby_whisper_params_get_print_special, 0);
rb_define_method(cParams, "print_special=", ruby_whisper_params_set_print_special, 1);
rb_define_method(cParams, "print_progress", ruby_whisper_params_get_print_progress, 0);
rb_define_method(cParams, "print_progress=", ruby_whisper_params_set_print_progress, 1);
rb_define_method(cParams, "print_realtime", ruby_whisper_params_get_print_realtime, 0);
rb_define_method(cParams, "print_realtime=", ruby_whisper_params_set_print_realtime, 1);
rb_define_method(cParams, "print_timestamps", ruby_whisper_params_get_print_timestamps, 0);
rb_define_method(cParams, "print_timestamps=", ruby_whisper_params_set_print_timestamps, 1);
rb_define_method(cParams, "suppress_blank", ruby_whisper_params_get_suppress_blank, 0);
rb_define_method(cParams, "suppress_blank=", ruby_whisper_params_set_suppress_blank, 1);
rb_define_method(cParams, "suppress_non_speech_tokens", ruby_whisper_params_get_suppress_non_speech_tokens, 0);
rb_define_method(cParams, "suppress_non_speech_tokens=", ruby_whisper_params_set_suppress_non_speech_tokens, 1);
rb_define_method(cParams, "token_timestamps", ruby_whisper_params_get_token_timestamps, 0);
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
rb_define_method(cParams, "offset", ruby_whisper_params_get_offset, 0);
rb_define_method(cParams, "offset=", ruby_whisper_params_set_offset, 1);
rb_define_method(cParams, "duration", ruby_whisper_params_get_duration, 0);
rb_define_method(cParams, "duration=", ruby_whisper_params_set_duration, 1);
rb_define_method(cParams, "max_text_tokens", ruby_whisper_params_get_max_text_tokens, 0);
rb_define_method(cParams, "max_text_tokens=", ruby_whisper_params_set_max_text_tokens, 1);
}
#ifdef __cplusplus
}
#endif

View File

@ -1,15 +1,8 @@
#ifndef RUBY_WHISPER_H
#define RUBY_WHISPER_H
#ifndef __RUBY_WHISPER_H
#define __RUBY_WHISPER_H
#include "whisper.h"
typedef struct {
VALUE *context;
VALUE user_data;
VALUE callback;
VALUE callbacks;
} ruby_whisper_callback_container;
typedef struct {
struct whisper_context *context;
} ruby_whisper;
@ -17,18 +10,6 @@ typedef struct {
typedef struct {
struct whisper_full_params params;
bool diarize;
ruby_whisper_callback_container *new_segment_callback_container;
ruby_whisper_callback_container *progress_callback_container;
ruby_whisper_callback_container *abort_callback_container;
} ruby_whisper_params;
typedef struct {
VALUE context;
int index;
} ruby_whisper_segment;
typedef struct {
VALUE context;
} ruby_whisper_model;
#endif

View File

@ -1,613 +0,0 @@
#include <ruby.h>
#include <ruby/memory_view.h>
#include "ruby_whisper.h"
extern ID id_to_s;
extern ID id___method__;
extern ID id_to_enum;
extern ID id_length;
extern ID id_next;
extern ID id_new;
extern ID id_to_path;
extern ID id_URI;
extern ID id_pre_converted_models;
extern VALUE cContext;
extern VALUE eError;
extern VALUE cModel;
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
extern VALUE rb_whisper_model_initialize(VALUE context);
extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
static void
ruby_whisper_free(ruby_whisper *rw)
{
if (rw->context) {
whisper_free(rw->context);
rw->context = NULL;
}
}
void
rb_whisper_mark(ruby_whisper *rw)
{
// call rb_gc_mark on any ruby references in rw
}
void
rb_whisper_free(ruby_whisper *rw)
{
ruby_whisper_free(rw);
free(rw);
}
static VALUE
ruby_whisper_allocate(VALUE klass)
{
ruby_whisper *rw;
rw = ALLOC(ruby_whisper);
rw->context = NULL;
return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
}
/*
* call-seq:
* new("base.en") -> Whisper::Context
* new("path/to/model.bin") -> Whisper::Context
* new(Whisper::Model::URI.new("https://example.net/uri/of/model.bin")) -> Whisper::Context
*/
static VALUE
ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
{
ruby_whisper *rw;
VALUE whisper_model_file_path;
// TODO: we can support init from buffer here too maybe another ruby object to expose
rb_scan_args(argc, argv, "01", &whisper_model_file_path);
Data_Get_Struct(self, ruby_whisper, rw);
VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, whisper_model_file_path);
if (!NIL_P(pre_converted_model)) {
whisper_model_file_path = pre_converted_model;
}
if (TYPE(whisper_model_file_path) == T_STRING) {
const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
VALUE uri_class = rb_const_get(cModel, id_URI);
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
}
}
if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
VALUE uri_class = rb_const_get(cModel, id_URI);
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
}
if (rb_respond_to(whisper_model_file_path, id_to_path)) {
whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
}
if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
}
rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
if (rw->context == NULL) {
rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
}
return self;
}
/*
* call-seq:
* model_n_vocab -> Integer
*/
VALUE ruby_whisper_model_n_vocab(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
/*
* call-seq:
* model_n_audio_ctx -> Integer
*/
VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
/*
* call-seq:
* model_n_audio_state -> Integer
*/
VALUE ruby_whisper_model_n_audio_state(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
/*
* call-seq:
* model_n_audio_head -> Integer
*/
VALUE ruby_whisper_model_n_audio_head(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
/*
* call-seq:
* model_n_audio_layer -> Integer
*/
VALUE ruby_whisper_model_n_audio_layer(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
/*
* call-seq:
* model_n_text_ctx -> Integer
*/
VALUE ruby_whisper_model_n_text_ctx(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
/*
* call-seq:
* model_n_text_state -> Integer
*/
VALUE ruby_whisper_model_n_text_state(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
/*
* call-seq:
* model_n_text_head -> Integer
*/
VALUE ruby_whisper_model_n_text_head(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
/*
* call-seq:
* model_n_text_layer -> Integer
*/
VALUE ruby_whisper_model_n_text_layer(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
/*
* call-seq:
* model_n_mels -> Integer
*/
VALUE ruby_whisper_model_n_mels(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
/*
* call-seq:
* model_ftype -> Integer
*/
VALUE ruby_whisper_model_ftype(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
/*
* call-seq:
* model_type -> String
*/
VALUE ruby_whisper_model_type(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
/*
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
* Not thread safe for same context
* Uses the specified decoding strategy to obtain the text.
*
* call-seq:
* full(params, samples, n_samples) -> nil
* full(params, samples) -> nil
*
* The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
*/
VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
{
if (argc < 2 || argc > 3) {
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
}
ruby_whisper *rw;
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper, rw);
VALUE params = argv[0];
Data_Get_Struct(params, ruby_whisper_params, rwp);
VALUE samples = argv[1];
int n_samples;
rb_memory_view_t view;
const bool memory_view_available_p = rb_memory_view_available_p(samples);
if (argc == 3) {
n_samples = NUM2INT(argv[2]);
if (TYPE(samples) == T_ARRAY) {
if (RARRAY_LEN(samples) < n_samples) {
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
}
}
// Should check when samples.respond_to?(:length)?
} else {
if (TYPE(samples) == T_ARRAY) {
n_samples = RARRAY_LEN(samples);
} else if (memory_view_available_p) {
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
view.obj = Qnil;
rb_raise(rb_eArgError, "unable to get a memory view");
}
n_samples = view.byte_size / view.item_size;
} else if (rb_respond_to(samples, id_length)) {
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
} else {
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
}
}
float * c_samples = (float *)malloc(n_samples * sizeof(float));
if (memory_view_available_p) {
c_samples = (float *)view.data;
} else {
if (TYPE(samples) == T_ARRAY) {
for (int i = 0; i < n_samples; i++) {
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
}
} else {
// TODO: use rb_block_call
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
for (int i = 0; i < n_samples; i++) {
// TODO: check if iter is exhausted and raise ArgumentError appropriately
VALUE sample = rb_funcall(iter, id_next, 0);
c_samples[i] = RFLOAT_VALUE(sample);
}
}
}
register_callbacks(rwp, &self);
const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
if (0 == result) {
return self;
} else {
rb_exc_raise(rb_funcall(eError, id_new, 1, result));
}
}
/*
* Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
* Result is stored in the default state of the context
* Not thread safe if executed in parallel on the same context.
* It seems this approach can offer some speedup in some cases.
* However, the transcription accuracy can be worse at the beginning and end of each chunk.
*
* call-seq:
* full_parallel(params, samples) -> nil
* full_parallel(params, samples, n_samples) -> nil
* full_parallel(params, samples, n_samples, n_processors) -> nil
* full_parallel(params, samples, nil, n_processors) -> nil
*/
static VALUE
ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
{
if (argc < 2 || argc > 4) {
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
}
ruby_whisper *rw;
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper, rw);
VALUE params = argv[0];
Data_Get_Struct(params, ruby_whisper_params, rwp);
VALUE samples = argv[1];
int n_samples;
int n_processors;
rb_memory_view_t view;
const bool memory_view_available_p = rb_memory_view_available_p(samples);
switch (argc) {
case 2:
n_processors = 1;
break;
case 3:
n_processors = 1;
break;
case 4:
n_processors = NUM2INT(argv[3]);
break;
}
if (argc >= 3 && !NIL_P(argv[2])) {
n_samples = NUM2INT(argv[2]);
if (TYPE(samples) == T_ARRAY) {
if (RARRAY_LEN(samples) < n_samples) {
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
}
}
// Should check when samples.respond_to?(:length)?
} else if (memory_view_available_p) {
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
view.obj = Qnil;
rb_raise(rb_eArgError, "unable to get a memory view");
}
n_samples = view.byte_size / view.item_size;
} else {
if (TYPE(samples) == T_ARRAY) {
n_samples = RARRAY_LEN(samples);
} else if (rb_respond_to(samples, id_length)) {
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
} else {
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
}
}
float * c_samples = (float *)malloc(n_samples * sizeof(float));
if (memory_view_available_p) {
c_samples = (float *)view.data;
} else {
if (TYPE(samples) == T_ARRAY) {
for (int i = 0; i < n_samples; i++) {
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
}
} else {
// FIXME: use rb_block_call
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
for (int i = 0; i < n_samples; i++) {
// TODO: check if iter is exhausted and raise ArgumentError
VALUE sample = rb_funcall(iter, id_next, 0);
c_samples[i] = RFLOAT_VALUE(sample);
}
}
}
register_callbacks(rwp, &self);
const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
if (0 == result) {
return self;
} else {
rb_exc_raise(rb_funcall(eError, id_new, 1, result));
}
}
/*
* Number of segments.
*
* call-seq:
* full_n_segments -> Integer
*/
static VALUE
ruby_whisper_full_n_segments(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_full_n_segments(rw->context));
}
/*
* Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
*
* call-seq:
* full_lang_id -> Integer
*/
static VALUE
ruby_whisper_full_lang_id(VALUE self)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_full_lang_id(rw->context));
}
static int ruby_whisper_full_check_segment_index(const ruby_whisper * rw, const VALUE i_segment)
{
const int c_i_segment = NUM2INT(i_segment);
if (c_i_segment < 0 || c_i_segment >= whisper_full_n_segments(rw->context)) {
rb_raise(rb_eIndexError, "segment index %d out of range", c_i_segment);
}
return c_i_segment;
}
/*
* Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
*
* full_get_segment_t0(3) # => 1668 (16680 ms)
*
* call-seq:
* full_get_segment_t0(segment_index) -> Integer
*/
static VALUE
ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
return INT2NUM(t0);
}
/*
* End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
*
* full_get_segment_t1(3) # => 1668 (16680 ms)
*
* call-seq:
* full_get_segment_t1(segment_index) -> Integer
*/
static VALUE
ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
return INT2NUM(t1);
}
/*
* Whether the next segment indexed by +segment_index+ is predicated as a speaker turn.
*
* full_get_segment_speacker_turn_next(3) # => true
*
* call-seq:
* full_get_segment_speacker_turn_next(segment_index) -> bool
*/
static VALUE
ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
return speaker_turn_next ? Qtrue : Qfalse;
}
/*
* Text of a segment indexed by +segment_index+.
*
* full_get_segment_text(3) # => "ask not what your country can do for you, ..."
*
* call-seq:
* full_get_segment_text(segment_index) -> String
*/
static VALUE
ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
return rb_str_new2(text);
}
/*
* call-seq:
* full_get_segment_no_speech_prob(segment_index) -> Float
*/
static VALUE
ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
return DBL2NUM(no_speech_prob);
}
// High level API
static VALUE
ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
{
return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
}
/*
* Yields each Whisper::Segment:
*
* whisper.transcribe("path/to/audio.wav", params)
* whisper.each_segment do |segment|
* puts segment.text
* end
*
* Returns an Enumerator if no block given:
*
* whisper.transcribe("path/to/audio.wav", params)
* enum = whisper.each_segment
* enum.to_a # => [#<Whisper::Segment>, ...]
*
* call-seq:
* each_segment {|segment| ... }
* each_segment -> Enumerator
*/
static VALUE
ruby_whisper_each_segment(VALUE self)
{
if (!rb_block_given_p()) {
const VALUE method_name = rb_funcall(self, id___method__, 0);
return rb_funcall(self, id_to_enum, 1, method_name);
}
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
const int n_segments = whisper_full_n_segments(rw->context);
for (int i = 0; i < n_segments; ++i) {
rb_yield(rb_whisper_segment_initialize(self, i));
}
return self;
}
/*
* call-seq:
* model -> Whisper::Model
*/
static VALUE
ruby_whisper_get_model(VALUE self)
{
return rb_whisper_model_initialize(self);
}
void
init_ruby_whisper_context(VALUE *mWhisper)
{
cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
rb_define_alloc_func(cContext, ruby_whisper_allocate);
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
rb_define_method(cContext, "model_n_vocab", ruby_whisper_model_n_vocab, 0);
rb_define_method(cContext, "model_n_audio_ctx", ruby_whisper_model_n_audio_ctx, 0);
rb_define_method(cContext, "model_n_audio_state", ruby_whisper_model_n_audio_state, 0);
rb_define_method(cContext, "model_n_audio_head", ruby_whisper_model_n_audio_head, 0);
rb_define_method(cContext, "model_n_audio_layer", ruby_whisper_model_n_audio_layer, 0);
rb_define_method(cContext, "model_n_text_ctx", ruby_whisper_model_n_text_ctx, 0);
rb_define_method(cContext, "model_n_text_state", ruby_whisper_model_n_text_state, 0);
rb_define_method(cContext, "model_n_text_head", ruby_whisper_model_n_text_head, 0);
rb_define_method(cContext, "model_n_text_layer", ruby_whisper_model_n_text_layer, 0);
rb_define_method(cContext, "model_n_mels", ruby_whisper_model_n_mels, 0);
rb_define_method(cContext, "model_ftype", ruby_whisper_model_ftype, 0);
rb_define_method(cContext, "model_type", ruby_whisper_model_type, 0);
rb_define_method(cContext, "full_n_segments", ruby_whisper_full_n_segments, 0);
rb_define_method(cContext, "full_lang_id", ruby_whisper_full_lang_id, 0);
rb_define_method(cContext, "full_get_segment_t0", ruby_whisper_full_get_segment_t0, 1);
rb_define_method(cContext, "full_get_segment_t1", ruby_whisper_full_get_segment_t1, 1);
rb_define_method(cContext, "full_get_segment_speaker_turn_next", ruby_whisper_full_get_segment_speaker_turn_next, 1);
rb_define_method(cContext, "full_get_segment_text", ruby_whisper_full_get_segment_text, 1);
rb_define_method(cContext, "full_get_segment_no_speech_prob", ruby_whisper_full_get_segment_no_speech_prob, 1);
rb_define_method(cContext, "full", ruby_whisper_full, -1);
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
// High leve
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
rb_define_method(cContext, "model", ruby_whisper_get_model, 0);
}

View File

@ -1,52 +0,0 @@
#include <ruby.h>
extern VALUE eError;
VALUE ruby_whisper_error_initialize(VALUE self, VALUE code)
{
const int c_code = NUM2INT(code);
const char *raw_message;
switch (c_code) {
case -2:
raw_message = "failed to compute log mel spectrogram";
break;
case -3:
raw_message = "failed to auto-detect language";
break;
case -4:
raw_message = "too many decoders requested";
break;
case -5:
raw_message = "audio_ctx is larger than the maximum allowed";
break;
case -6:
raw_message = "failed to encode";
break;
case -7:
raw_message = "whisper_kv_cache_init() failed for self-attention cache";
break;
case -8:
raw_message = "failed to decode";
break;
case -9:
raw_message = "failed to decode";
break;
default:
raw_message = "unknown error";
break;
}
const VALUE message = rb_str_new2(raw_message);
rb_call_super(1, &message);
rb_iv_set(self, "@code", code);
return self;
}
void
init_ruby_whisper_error(VALUE *mWhisper)
{
eError = rb_define_class_under(*mWhisper, "Error", rb_eStandardError);
rb_define_attr(eError, "code", true, false);
rb_define_method(eError, "initialize", ruby_whisper_error_initialize, 1);
}

View File

@ -1,210 +0,0 @@
#include <ruby.h>
#include "ruby_whisper.h"
extern VALUE cModel;
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
rb_gc_mark(rwm->context);
}
static VALUE ruby_whisper_model_allocate(VALUE klass) {
ruby_whisper_model *rwm;
rwm = ALLOC(ruby_whisper_model);
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
}
VALUE rb_whisper_model_initialize(VALUE context) {
ruby_whisper_model *rwm;
const VALUE model = ruby_whisper_model_allocate(cModel);
Data_Get_Struct(model, ruby_whisper_model, rwm);
rwm->context = context;
return model;
};
/*
* call-seq:
* n_vocab -> Integer
*/
static VALUE
ruby_whisper_model_n_vocab(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
/*
* call-seq:
* n_audio_ctx -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_ctx(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
/*
* call-seq:
* n_audio_state -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_state(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
/*
* call-seq:
* n_audio_head -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_head(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
/*
* call-seq:
* n_audio_layer -> Integer
*/
static VALUE
ruby_whisper_model_n_audio_layer(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
/*
* call-seq:
* n_text_ctx -> Integer
*/
static VALUE
ruby_whisper_model_n_text_ctx(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
/*
* call-seq:
* n_text_state -> Integer
*/
static VALUE
ruby_whisper_model_n_text_state(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
/*
* call-seq:
* n_text_head -> Integer
*/
static VALUE
ruby_whisper_model_n_text_head(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
/*
* call-seq:
* n_text_layer -> Integer
*/
static VALUE
ruby_whisper_model_n_text_layer(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
/*
* call-seq:
* n_mels -> Integer
*/
static VALUE
ruby_whisper_model_n_mels(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
/*
* call-seq:
* ftype -> Integer
*/
static VALUE
ruby_whisper_model_ftype(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
/*
* call-seq:
* type -> String
*/
static VALUE
ruby_whisper_model_type(VALUE self)
{
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
void
init_ruby_whisper_model(VALUE *mWhisper)
{
cModel = rb_define_class_under(*mWhisper, "Model", rb_cObject);
rb_define_alloc_func(cModel, ruby_whisper_model_allocate);
rb_define_method(cModel, "n_vocab", ruby_whisper_model_n_vocab, 0);
rb_define_method(cModel, "n_audio_ctx", ruby_whisper_model_n_audio_ctx, 0);
rb_define_method(cModel, "n_audio_state", ruby_whisper_model_n_audio_state, 0);
rb_define_method(cModel, "n_audio_head", ruby_whisper_model_n_audio_head, 0);
rb_define_method(cModel, "n_audio_layer", ruby_whisper_model_n_audio_layer, 0);
rb_define_method(cModel, "n_text_ctx", ruby_whisper_model_n_text_ctx, 0);
rb_define_method(cModel, "n_text_state", ruby_whisper_model_n_text_state, 0);
rb_define_method(cModel, "n_text_head", ruby_whisper_model_n_text_head, 0);
rb_define_method(cModel, "n_text_layer", ruby_whisper_model_n_text_layer, 0);
rb_define_method(cModel, "n_mels", ruby_whisper_model_n_mels, 0);
rb_define_method(cModel, "ftype", ruby_whisper_model_ftype, 0);
rb_define_method(cModel, "type", ruby_whisper_model_type, 0);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,123 +0,0 @@
#include <ruby.h>
#include "ruby_whisper.h"
extern VALUE cSegment;
static void
rb_whisper_segment_mark(ruby_whisper_segment *rws)
{
rb_gc_mark(rws->context);
}
VALUE
ruby_whisper_segment_allocate(VALUE klass)
{
ruby_whisper_segment *rws;
rws = ALLOC(ruby_whisper_segment);
return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
}
VALUE
rb_whisper_segment_initialize(VALUE context, int index)
{
ruby_whisper_segment *rws;
const VALUE segment = ruby_whisper_segment_allocate(cSegment);
Data_Get_Struct(segment, ruby_whisper_segment, rws);
rws->context = context;
rws->index = index;
return segment;
};
/*
* Start time in milliseconds.
*
* call-seq:
* start_time -> Integer
*/
static VALUE
ruby_whisper_segment_get_start_time(VALUE self)
{
ruby_whisper_segment *rws;
Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
Data_Get_Struct(rws->context, ruby_whisper, rw);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
return INT2NUM(t0 * 10);
}
/*
* End time in milliseconds.
*
* call-seq:
* end_time -> Integer
*/
static VALUE
ruby_whisper_segment_get_end_time(VALUE self)
{
ruby_whisper_segment *rws;
Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
Data_Get_Struct(rws->context, ruby_whisper, rw);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
return INT2NUM(t1 * 10);
}
/*
* Whether the next segment is predicted as a speaker turn.
*
* call-seq:
* speaker_turn_next? -> bool
*/
static VALUE
ruby_whisper_segment_get_speaker_turn_next(VALUE self)
{
ruby_whisper_segment *rws;
Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
Data_Get_Struct(rws->context, ruby_whisper, rw);
return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
}
/*
* call-seq:
* text -> String
*/
static VALUE
ruby_whisper_segment_get_text(VALUE self)
{
ruby_whisper_segment *rws;
Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
Data_Get_Struct(rws->context, ruby_whisper, rw);
const char * text = whisper_full_get_segment_text(rw->context, rws->index);
return rb_str_new2(text);
}
/*
* call-seq:
* no_speech_prob -> Float
*/
static VALUE
ruby_whisper_segment_get_no_speech_prob(VALUE self)
{
ruby_whisper_segment *rws;
Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
Data_Get_Struct(rws->context, ruby_whisper, rw);
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
}
void
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
{
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
}

View File

@ -1,83 +0,0 @@
#include <ruby.h>
#include "ruby_whisper.h"
#include "common-whisper.h"
#include <string>
#include <vector>
#ifdef __cplusplus
extern "C" {
#endif
extern ID id_to_s;
extern ID id_call;
extern void
register_callbacks(ruby_whisper_params * rwp, VALUE * self);
/*
* transcribe a single file
* can emit to a block results
*
* params = Whisper::Params.new
* params.duration = 60_000
* whisper.transcribe "path/to/audio.wav", params do |text|
* puts text
* end
*
* call-seq:
* transcribe(path_to_audio, params) {|text| ...}
**/
VALUE
ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
ruby_whisper *rw;
ruby_whisper_params *rwp;
VALUE wave_file_path, blk, params;
rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
Data_Get_Struct(self, ruby_whisper, rw);
Data_Get_Struct(params, ruby_whisper_params, rwp);
if (!rb_respond_to(wave_file_path, id_to_s)) {
rb_raise(rb_eRuntimeError, "Expected file path to wave file");
}
std::string fname_inp = StringValueCStr(wave_file_path);
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
if (!read_audio_data(fname_inp, pcmf32, pcmf32s, rwp->diarize)) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
return self;
}
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
rwp->params.encoder_begin_callback_user_data = &is_aborted;
}
register_callbacks(rwp, &self);
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
fprintf(stderr, "failed to process audio\n");
return self;
}
const int n_segments = whisper_full_n_segments(rw->context);
VALUE output = rb_str_new2("");
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(rw->context, i);
output = rb_str_concat(output, rb_str_new2(text));
}
VALUE idCall = id_call;
if (blk != Qnil) {
rb_funcall(blk, idCall, 1, output);
}
return self;
}
#ifdef __cplusplus
}
#endif

View File

@ -1,6 +0,0 @@
require "yaml"
sources = `git ls-files -z ../..`.split("\x0")
paths = YAML.load_file("../../.github/workflows/bindings-ruby.yml")[true]["push"]["paths"]
paths.delete "bindings/ruby/**"
EXTSOURCES = (Dir.glob(paths, base: "../..").collect {|path| "../../#{path}"} << "../../LICENSE") & sources

View File

@ -1,170 +0,0 @@
require "uri"
require "net/http"
require "time"
require "pathname"
require "io/console/size"
module Whisper
class Model
class URI
def initialize(uri)
@uri = URI(uri)
end
def to_path
cache
cache_path.to_path
end
def clear_cache
path = cache_path
path.delete if path.exist?
end
private
def cache_path
base_cache_dir/@uri.host/@uri.path[1..]
end
def base_cache_dir
base = case RUBY_PLATFORM
when /mswin|mingw/
ENV.key?("LOCALAPPDATA") ? Pathname(ENV["LOCALAPPDATA"]) : Pathname(Dir.home)/"AppData/Local"
when /darwin/
Pathname(Dir.home)/"Library/Caches"
else
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
end
base/"whisper.cpp"
end
def cache
path = cache_path
headers = {}
headers["if-modified-since"] = path.mtime.httpdate if path.exist?
request @uri, headers
path
end
def request(uri, headers)
Net::HTTP.start uri.host, uri.port, use_ssl: uri.scheme == "https" do |http|
request = Net::HTTP::Get.new(uri, headers)
http.request request do |response|
case response
when Net::HTTPNotModified
# noop
when Net::HTTPOK
download response
when Net::HTTPRedirection
request URI(response["location"]), headers
else
return if headers.key?("if-modified-since") # Use cache file
raise "#{response.code} #{response.message}\n#{response.body}"
end
end
end
rescue => err
if cache_path.exist?
warn err
# Use cache file
else
raise
end
end
def download(response)
path = cache_path
path.dirname.mkpath unless path.dirname.exist?
downloading_path = Pathname("#{path}.downloading")
size = response.content_length
downloading_path.open "wb" do |file|
downloaded = 0
response.read_body do |chunk|
file << chunk
downloaded += chunk.bytesize
show_progress downloaded, size
end
$stderr.puts
end
downloading_path.rename path
end
def show_progress(current, size)
progress_rate_available = size && $stderr.tty?
unless @prev
@prev = Time.now
$stderr.puts "Downloading #{@uri} to #{cache_path}"
end
now = Time.now
if progress_rate_available
return if now - @prev < 1 && current < size
progress_width = 20
progress = current.to_f / size
arrow_length = progress * progress_width
arrow = "=" * (arrow_length - 1) + ">" + " " * (progress_width - arrow_length)
line = "[#{arrow}] (#{format_bytesize(current)} / #{format_bytesize(size)})"
padding = ' ' * ($stderr.winsize[1] - line.size)
$stderr.print "\r#{line}#{padding}"
else
return if now - @prev < 1
$stderr.print "."
end
@prev = now
end
def format_bytesize(bytesize)
return "0.0 B" if bytesize.zero?
units = %w[B KiB MiB GiB TiB]
exp = (Math.log(bytesize) / Math.log(1024)).to_i
format("%.1f %s", bytesize.to_f / 1024 ** exp, units[exp])
end
end
@pre_converted_models = %w[
tiny
tiny.en
tiny-q5_1
tiny.en-q5_1
tiny-q8_0
base
base.en
base-q5_1
base.en-q5_1
base-q8_0
small
small.en
small.en-tdrz
small-q5_1
small.en-q5_1
small-q8_0
medium
medium.en
medium-q5_0
medium.en-q5_0
medium-q8_0
large-v1
large-v2
large-v2-q5_0
large-v2-q8_0
large-v3
large-v3-q5_0
large-v3-turbo
large-v3-turbo-q5_0
large-v3-turbo-q8_0
].each_with_object({}) {|name, models|
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
}
class << self
attr_reader :pre_converted_models
end
end
end

View File

@ -1,189 +0,0 @@
module Whisper
interface _Samples
def length: () -> Integer
def each: { (Float) -> void } -> void
end
type log_callback = ^(Integer level, String message, Object user_data) -> void
type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish
LOG_LEVEL_NONE: Integer
LOG_LEVEL_INFO: Integer
LOG_LEVEL_WARN: Integer
LOG_LEVEL_ERROR: Integer
LOG_LEVEL_DEBUG: Integer
LOG_LEVEL_CONT: Integer
def self.lang_max_id: () -> Integer
def self.lang_id: (string name) -> Integer
def self.lang_str: (Integer id) -> String
def self.lang_str_full: (Integer id) -> String
def self.log_set: (log_callback, Object? user_data) -> log_callback
class Context
def self.new: (string | _ToPath | ::URI::HTTP) -> instance
def transcribe: (string, Params) -> self
| (string, Params) { (String) -> void } -> self
def model_n_vocab: () -> Integer
def model_n_audio_ctx: () -> Integer
def model_n_audio_state: () -> Integer
def model_n_text_head: () -> Integer
def model_n_text_layer: () -> Integer
def model_n_mels: () -> Integer
def model_ftype: () -> Integer
def model_type: () -> String
def each_segment: { (Segment) -> void } -> void
| () -> Enumerator[Segment]
def model: () -> Model
def full_get_segment: (Integer nth) -> Segment
def full_n_segments: () -> Integer
def full_lang_id: () -> Integer
def full_get_segment_t0: (Integer) -> Integer
def full_get_segment_t1: (Integer) -> Integer
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
def full_get_segment_text: (Integer) -> String
def full_get_segment_no_speech_prob: (Integer) -> Float
def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
| (Params, _Samples, ?Integer n_samples) -> self
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
| (Params, _Samples, ?Integer n_samples) -> self
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
end
class Params
def self.new: (
?language: string,
?translate: boolish,
?no_context: boolish,
?single_segment: boolish,
?print_special: boolish,
?print_progress: boolish,
?print_realtime: boolish,
?print_timestamps: boolish,
?suppress_blank: boolish,
?suppress_nst: boolish,
?token_timestamps: boolish,
?split_on_word: boolish,
?initial_prompt: string | nil,
?diarize: boolish,
?offset: Integer,
?duration: Integer,
?max_text_tokens: Integer,
?temperature: Float,
?max_initial_ts: Float,
?length_penalty: Float,
?temperature_inc: Float,
?entropy_thold: Float,
?logprob_thold: Float,
?no_speech_thold: Float,
?new_segment_callback: new_segment_callback,
?new_segment_callback_user_data: Object,
?progress_callback: progress_callback,
?progress_callback_user_data: Object,
?abort_callback: abort_callback,
?abort_callback_user_data: Object
) -> instance
def language=: (String) -> String # TODO: Enumerate lang names
def language: () -> String
def translate=: (boolish) -> boolish
def translate: () -> (true | false)
def no_context=: (boolish) -> boolish
def no_context: () -> (true | false)
def single_segment=: (boolish) -> boolish
def single_segment: () -> (true | false)
def print_special=: (boolish) -> boolish
def print_special: () -> (true | false)
def print_progress=: (boolish) -> boolish
def print_progress: () -> (true | false)
def print_realtime=: (boolish) -> boolish
def print_realtime: () -> (true | false)
def print_timestamps=: (boolish) -> boolish
def print_timestamps: () -> (true | false)
def suppress_blank=: (boolish) -> boolish
def suppress_blank: () -> (true | false)
def suppress_nst=: (boolish) -> boolish
def suppress_nst: () -> (true | false)
def token_timestamps=: (boolish) -> boolish
def token_timestamps: () -> (true | false)
def split_on_word=: (boolish) -> boolish
def split_on_word: () -> (true | false)
def initial_prompt=: (_ToS) -> _ToS
def initial_prompt: () -> (String | nil)
def diarize=: (boolish) -> boolish
def diarize: () -> (true | false)
def offset=: (Integer) -> Integer
def offset: () -> Integer
def duration=: (Integer) -> Integer
def duration: () -> Integer
def max_text_tokens=: (Integer) -> Integer
def max_text_tokens: () -> Integer
def temperature=: (Float) -> Float
def temperature: () -> Float
def max_initial_ts=: (Float) -> Float
def max_initial_ts: () -> Float
def length_penalty=: (Float) -> Float
def length_penalty: () -> Float
def temperature_inc=: (Float) -> Float
def temperature_inc: () -> Float
def entropy_thold=: (Float) -> Float
def entropy_thold: () -> Float
def logprob_thold=: (Float) -> Float
def logprob_thold: () -> Float
def no_speech_thold=: (Float) -> Float
def no_speech_thold: () -> Float
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
def new_segment_callback: () -> (new_segment_callback | nil)
def new_segment_callback_user_data=: (Object) -> Object
def new_segment_callback_user_data: () -> Object
def progress_callback=: (progress_callback) -> progress_callback
def progress_callback: () -> (progress_callback | nil)
def progress_callback_user_data=: (Object) -> Object
def progress_callback_user_data: () -> Object
def abort_callback=: (abort_callback) -> abort_callback
def abort_callback: () -> (abort_callback | nil)
def abort_callback_user_data=: (Object) -> Object
def abort_callback_user_data: () -> Object
def on_new_segment: { (Segment) -> void } -> void
def on_progress: { (Integer progress) -> void } -> void
def abort_on: { (Object user_data) -> boolish } -> void
end
class Model
def self.pre_converted_models: () -> Hash[String, Model::URI]
def self.new: () -> instance
def n_vocab: () -> Integer
def n_audio_ctx: () -> Integer
def n_audio_state: () -> Integer
def n_audio_head: () -> Integer
def n_audio_layer: () -> Integer
def n_text_ctx: () -> Integer
def n_text_state: () -> Integer
def n_text_head: () -> Integer
def n_text_layer: () -> Integer
def n_mels: () -> Integer
def ftype: () -> Integer
def type: () -> String
class URI
def self.new: (string | ::URI::HTTP) -> self
def to_path: -> String
def clear_cache: -> void
end
end
class Segment
def start_time: () -> Integer
def end_time: () -> Integer
def speaker_next_turn?: () -> (true | false)
def text: () -> String
def no_speech_prob: () -> Float
end
class Error < StandardError
attr_reader code: Integer
def self.new: (Integer code) -> instance
end
end

View File

@ -1,24 +0,0 @@
require "test/unit"
require "whisper"
require_relative "jfk_reader/jfk_reader"
class TestBase < Test::Unit::TestCase
AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new("base.en")
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
end
end
private
def whisper
self.class.whisper
end
end

View File

@ -1,5 +0,0 @@
Makefile
jfk_reader.o
jfk_reader.so
jfk_reader.bundle
jfk_reader.dll

Some files were not shown because too many files have changed in this diff Show More