whisper : quantize encoder only

2025-06-24 17:15:19 +00:00 · 2023-11-16 16:19:02 +02:00
21 changed files with 336 additions and 34887 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -320,13 +320,6 @@ jobs:
          cd ./build
          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

-      - name: Copy CUDA DLLs
-        run: >
-          Copy-Item -PassThru
-          -Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
-          -Include cudart64_*,cublas64_*,cublasLt64_*
-          -Destination build/bin/${{ matrix.build }}
-
      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
--- a/.gitignore
+++ b/.gitignore
@ -31,7 +31,6 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
-/server
 /lsp

 arm_neon.h
--- a/7
+++ b/7
@ -1,4 +1,4 @@
-default: main bench quantize server
+default: main bench quantize

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@ -338,7 +338,7 @@ libwhisper.so: $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so

 #
 # Examples
@ -359,9 +359,6 @@ bench: examples/bench/bench.cpp $(WHISPER_OBJ)
 quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)

-server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
-
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

--- a/bindings/ios
+++ b/bindings/ios
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -65,7 +65,6 @@ elseif(CMAKE_JS_VERSION)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
-    add_subdirectory(server)
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(quantize)
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -139,13 +139,10 @@ void audio_async::callback(uint8_t * stream, int len) {
        return;
    }

-    size_t n_samples = len / sizeof(float);
+    const size_t n_samples = len / sizeof(float);

-    if (n_samples > m_audio.size()) {
-        n_samples = m_audio.size();
-
-        stream += (len - (n_samples * sizeof(float)));
-    }
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));

    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);

@ -156,7 +153,7 @@ void audio_async::callback(uint8_t * stream, int len) {
            const size_t n0 = m_audio.size() - m_audio_pos;

            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));

            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = m_audio.size();
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -41,6 +41,7 @@ private:
    std::mutex       m_mutex;

    std::vector<float> m_audio;
+    std::vector<float> m_audio_new;
    size_t             m_audio_pos = 0;
    size_t             m_audio_len = 0;
 };
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -165,8 +165,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-m"    || arg == "--model")           { params.model           = argv[++i]; }
        else if (arg == "-f"    || arg == "--file")            { params.fname_inp.emplace_back(argv[++i]); }
        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = argv[++i]; }
-        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score       = true; }
-        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu         = false; }
+        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score = true; }
+        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu = false; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -162,6 +162,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
        "encoder.conv2.bias",
        "encoder.positional_embedding",
        "decoder.positional_embedding",
+        "decoder.*",
    };

    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -1,6 +0,0 @@
-set(TARGET server)
-add_executable(${TARGET} server.cpp httplib.h json.hpp)
-
-include(DefaultTargetOptions)
-
-target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -1,59 +0,0 @@
-# whisper.cpp http server
-
-Simple http server. WAV Files are passed to the inference model via http requests.
-
-```
-./server -h
-
-usage: ./bin/server [options]
-
-options:
-  -h,        --help              [default] show this help message and exit
-  -t N,      --threads N         [4      ] number of threads to use during computation
-  -p N,      --processors N      [1      ] number of processors to use during computation
-  -ot N,     --offset-t N        [0      ] time offset in milliseconds
-  -on N,     --offset-n N        [0      ] segment index offset
-  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
-  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
-  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -sow,      --split-on-word     [false  ] split on word rather than on token
-  -bo N,     --best-of N         [2      ] number of best candidates to keep
-  -bs N,     --beam-size N       [-1     ] beam size for beam search
-  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
-  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
-  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
-  -tr,       --translate         [false  ] translate from source language to english
-  -di,       --diarize           [false  ] stereo audio diarization
-  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
-  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
-  -ps,       --print-special     [false  ] print special tokens
-  -pc,       --print-colors      [false  ] print colors
-  -pp,       --print-progress    [false  ] print progress
-  -nt,       --no-timestamps     [false  ] do not print timestamps
-  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
-  -dl,       --detect-language   [false  ] exit after automatically detecting language
-             --prompt PROMPT     [       ] initial prompt
-  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
-  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
-  --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
-  --port PORT,                   [8080   ] Port number for the server
-```
-
-## request examples
-
-**/inference**
-```
-curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@<file-path>" \
-F temperature="0.2" \
-F response-format="json"
-```
-
-**/load**
-```
-curl 127.0.0.1:8080/load \
-H "Content-Type: multipart/form-data" \
-F model="<path-to-model-file>"
-```
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1,699 +0,0 @@
-#include "common.h"
-
-#include "whisper.h"
-#include "httplib.h"
-#include "json.hpp"
-
-#include <cmath>
-#include <fstream>
-#include <cstdio>
-#include <string>
-#include <thread>
-#include <vector>
-#include <cstring>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-using namespace httplib;
-using json = nlohmann::json;
-
-namespace {
-
-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
-const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
-};
-
-// output formats
-const std::string json_format   = "json";
-const std::string text_format   = "text";
-const std::string srt_format    = "srt";
-const std::string vjson_format  = "verbose_json";
-const std::string vtt_format    = "vtt";
-
-struct server_params
-{
-    std::string hostname = "127.0.0.1";
-    std::string public_path = "examples/server/public";
-
-    int32_t port          = 8080;
-    int32_t read_timeout  = 600;
-    int32_t write_timeout = 600;
-};
-
-struct whisper_params {
-    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors =  1;
-    int32_t offset_t_ms  =  0;
-    int32_t offset_n     =  0;
-    int32_t duration_ms  =  0;
-    int32_t progress_step =  5;
-    int32_t max_context  = -1;
-    int32_t max_len      =  0;
-    int32_t best_of      =  2;
-    int32_t beam_size    = -1;
-
-    float word_thold    =  0.01f;
-    float entropy_thold =  2.40f;
-    float logprob_thold = -1.00f;
-    float userdef_temp  =  0.20f;
-
-    bool speed_up        = false;
-    bool debug_mode      = false;
-    bool translate       = false;
-    bool detect_language = false;
-    bool diarize         = false;
-    bool tinydiarize     = false;
-    bool split_on_word   = false;
-    bool no_fallback     = false;
-    bool print_special   = false;
-    bool print_colors    = false;
-    bool print_progress  = false;
-    bool no_timestamps   = false;
-    bool use_gpu         = true;
-
-    std::string language        = "en";
-    std::string prompt          = "";
-    std::string font_path       = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-    std::string model           = "models/ggml-base.en.bin";
-
-    std::string response_format     = json_format;
-
-    // [TDRZ] speaker turn string
-    std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
-
-    std::string openvino_encode_device = "CPU";
-};
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-int timestamp_to_sample(int64_t t, int n_samples) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
-}
-
-bool is_file_exist(const char *fileName)
-{
-    std::ifstream infile(fileName);
-    return infile.good();
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
-                         const server_params& sparams) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options] \n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
-    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -tdrz,     --tinydiarize       [%-7s] enable tinydiarize (requires a tdrz model)\n",     params.tinydiarize ? "true" : "false");
-    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
-    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
-    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
-    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -oved D,   --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
-    // server params
-    fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
-    fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
-    fprintf(stderr, "\n");
-}
-
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params, server_params & sparams) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params, sparams);
-            exit(0);
-        }
-        else if (arg == "-t"    || arg == "--threads")         { params.n_threads       = std::stoi(argv[++i]); }
-        else if (arg == "-p"    || arg == "--processors")      { params.n_processors    = std::stoi(argv[++i]); }
-        else if (arg == "-ot"   || arg == "--offset-t")        { params.offset_t_ms     = std::stoi(argv[++i]); }
-        else if (arg == "-on"   || arg == "--offset-n")        { params.offset_n        = std::stoi(argv[++i]); }
-        else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
-        else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
-        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
-        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
-        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
-        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
-        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
-        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
-        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
-        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
-        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
-        else if (arg == "-tdrz" || arg == "--tinydiarize")     { params.tinydiarize     = true; }
-        else if (arg == "-sow"  || arg == "--split-on-word")   { params.split_on_word   = true; }
-        else if (arg == "-nf"   || arg == "--no-fallback")     { params.no_fallback     = true; }
-        else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = argv[++i]; }
-        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
-        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
-        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
-        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
-        else if (arg == "-l"    || arg == "--language")        { params.language        = argv[++i]; }
-        else if (arg == "-dl"   || arg == "--detect-language") { params.detect_language = true; }
-        else if (                  arg == "--prompt")          { params.prompt          = argv[++i]; }
-        else if (arg == "-m"    || arg == "--model")           { params.model           = argv[++i]; }
-        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = argv[++i]; }
-        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu         = false; }
-        // server params
-        else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
-        else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
-        else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params, sparams);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-struct whisper_print_user_data {
-    const whisper_params * params;
-
-    const std::vector<std::vector<float>> * pcmf32s;
-    int progress_prev;
-};
-
-std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
-    std::string speaker = "";
-    const int64_t n_samples = pcmf32s[0].size();
-
-    const int64_t is0 = timestamp_to_sample(t0, n_samples);
-    const int64_t is1 = timestamp_to_sample(t1, n_samples);
-
-    double energy0 = 0.0f;
-    double energy1 = 0.0f;
-
-    for (int64_t j = is0; j < is1; j++) {
-        energy0 += fabs(pcmf32s[0][j]);
-        energy1 += fabs(pcmf32s[1][j]);
-    }
-
-    if (energy0 > 1.1*energy1) {
-        speaker = "0";
-    } else if (energy1 > 1.1*energy0) {
-        speaker = "1";
-    } else {
-        speaker = "?";
-    }
-
-    //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
-
-    if (!id_only) {
-        speaker.insert(0, "(speaker ");
-        speaker.append(")");
-    }
-
-    return speaker;
-}
-
-void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
-    int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
-    int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
-    if (progress >= *progress_prev + progress_step) {
-        *progress_prev += progress_step;
-        fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
-    }
-}
-
-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
-    const auto & params  = *((whisper_print_user_data *) user_data)->params;
-    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-
-    std::string speaker = "";
-
-    int64_t t0 = 0;
-    int64_t t1 = 0;
-
-    // print the last n_new segments
-    const int s0 = n_segments - n_new;
-
-    if (s0 == 0) {
-        printf("\n");
-    }
-
-    for (int i = s0; i < n_segments; i++) {
-        if (!params.no_timestamps || params.diarize) {
-            t0 = whisper_full_get_segment_t0(ctx, i);
-            t1 = whisper_full_get_segment_t1(ctx, i);
-        }
-
-        if (!params.no_timestamps) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-        }
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        if (params.print_colors) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-                }
-
-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
-            }
-        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);
-
-            printf("%s%s", speaker.c_str(), text);
-        }
-
-        if (params.tinydiarize) {
-            if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
-                printf("%s", params.tdrz_speaker_turn.c_str());
-            }
-        }
-
-        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
-            printf("\n");
-        }
-        fflush(stdout);
-    }
-}
-
-std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::stringstream result;
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        result << speaker << text << "\n";
-    }
-    return result.str();
-}
-
-void get_req_parameters(const Request & req, whisper_params & params)
-{
-    // user model configu.has_fileion
-    if (req.has_file("offset-t"))
-    {
-        params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
-    }
-    if (req.has_file("offset-n"))
-    {
-        params.offset_n = std::stoi(req.get_file_value("offset-n").content);
-    }
-    if (req.has_file("duration"))
-    {
-        params.duration_ms = std::stoi(req.get_file_value("duration").content);
-    }
-    if (req.has_file("max-context"))
-    {
-        params.max_context = std::stoi(req.get_file_value("max-context").content);
-    }
-    if (req.has_file("prompt"))
-    {
-        params.prompt = req.get_file_value("prompt").content;
-    }
-    if (req.has_file("response-format"))
-    {
-        params.response_format = req.get_file_value("response-format").content;
-    }
-    if (req.has_file("temerature"))
-    {
-        params.userdef_temp = std::stof(req.get_file_value("temperature").content);
-    }
-}
-
-}  // namespace
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-    server_params sparams;
-
-    std::mutex whisper_mutex;
-
-    if (whisper_params_parse(argc, argv, params, sparams) == false) {
-        whisper_print_usage(argc, argv, params, sparams);
-        return 1;
-    }
-
-    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params, sparams);
-        exit(0);
-    }
-
-    if (params.diarize && params.tinydiarize) {
-        fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
-        whisper_print_usage(argc, argv, params, sparams);
-        exit(0);
-    }
-
-    // whisper init
-    struct whisper_context_params cparams;
-    cparams.use_gpu = params.use_gpu;
-
-    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 3;
-    }
-
-    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
-    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
-
-    Server svr;
-
-    std::string const default_content = "<html>hello</html>";
-
-    // this is only called if no index.html is found in the public --path
-    svr.Get("/", [&default_content](const Request &, Response &res){
-        res.set_content(default_content, "text/html");
-        return false;
-    });
-
-    svr.Post("/inference", [&](const Request &req, Response &res){
-        // aquire whisper model mutex lock
-        whisper_mutex.lock();
-
-        // first check user requested fields of the request
-        if (!req.has_file("file"))
-        {
-            fprintf(stderr, "error: no 'file' field in the request\n");
-            const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
-            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
-            return;
-        }
-        auto audio_file = req.get_file_value("file");
-
-        // check non-required fields
-        get_req_parameters(req, params);
-
-        std::string filename{audio_file.filename};
-        printf("Received request: %s\n", filename.c_str());
-
-        // audio arrays
-        std::vector<float> pcmf32;               // mono-channel F32 PCM
-        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-        // write file to temporary file
-        std::ofstream temp_file{filename, std::ios::binary};
-        temp_file << audio_file.content;
-
-        // read wav content into pcmf32
-        if (!::read_wav(filename, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", filename.c_str());
-            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
-            return;
-        }
-        // remove temp file
-        std::remove(filename.c_str());
-
-        printf("Successfully loaded %s\n", filename.c_str());
-
-        // print system information
-        {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-        }
-
-        // print some info about the processing
-        {
-            fprintf(stderr, "\n");
-            if (!whisper_is_multilingual(ctx)) {
-                if (params.language != "en" || params.translate) {
-                    params.language = "en";
-                    params.translate = false;
-                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-                }
-            }
-            if (params.detect_language) {
-                params.language = "auto";
-            }
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
-                    __func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors,
-                    params.language.c_str(),
-                    params.translate ? "translate" : "transcribe",
-                    params.tinydiarize ? "tdrz = 1, " : "",
-                    params.no_timestamps ? 0 : 1);
-
-            fprintf(stderr, "\n");
-        }
-
-        // run the inference
-        {
-
-            printf("Running whisper.cpp inference on %s\n", filename.c_str());
-            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-            wparams.print_realtime   = false;
-            wparams.print_progress   = params.print_progress;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special    = params.print_special;
-            wparams.translate        = params.translate;
-            wparams.language         = params.language.c_str();
-            wparams.detect_language  = params.detect_language;
-            wparams.n_threads        = params.n_threads;
-            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-            wparams.offset_ms        = params.offset_t_ms;
-            wparams.duration_ms      = params.duration_ms;
-
-            wparams.thold_pt         = params.word_thold;
-            wparams.split_on_word    = params.split_on_word;
-
-            wparams.speed_up         = params.speed_up;
-            wparams.debug_mode       = params.debug_mode;
-
-            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
-
-            wparams.initial_prompt   = params.prompt.c_str();
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.temperature_inc  = params.userdef_temp;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
-
-            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
-
-            // this callback is called on each new segment
-            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &user_data;
-            }
-
-            if (wparams.print_progress) {
-                wparams.progress_callback           = whisper_print_progress_callback;
-                wparams.progress_callback_user_data = &user_data;
-            }
-
-            // examples for abort mechanism
-            // in examples below, we do not abort the processing, but we could if the flag is set to true
-
-            // the callback is called before every encoder run - if it returns false, the processing is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return !is_aborted;
-                };
-                wparams.encoder_begin_callback_user_data = &is_aborted;
-            }
-
-            // the callback is called before every computation - if it returns true, the computation is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.abort_callback = [](void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return is_aborted;
-                };
-                wparams.abort_callback_user_data = &is_aborted;
-            }
-
-            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                const std::string error_resp = "{\"error\":\"failed to process audio\"}";
-                res.set_content(error_resp, "application/json");
-                whisper_mutex.unlock();
-                return;
-            }
-        }
-
-        // return results to user
-        if (params.response_format == text_format)
-        {
-            std::string results = output_str(ctx, params, pcmf32s);
-            res.set_content(results.c_str(), "text/html");
-        }
-        // TODO add more output formats
-        else
-        {
-            std::string results = output_str(ctx, params, pcmf32s);
-            json jres = json{
-                {"text", results}
-            };
-            res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
-        }
-
-        // return whisper model mutex lock
-        whisper_mutex.unlock();
-    });
-    svr.Post("/load", [&](const Request &req, Response &res){
-        whisper_mutex.lock();
-        if (!req.has_file("model"))
-        {
-            fprintf(stderr, "error: no 'model' field in the request\n");
-            const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
-            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
-            return;
-        }
-        std::string model = req.get_file_value("model").content;
-        if (!is_file_exist(model.c_str()))
-        {
-            fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
-            const std::string error_resp = "{\"error\":\"model not found!\"}";
-            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
-            return;
-        }
-
-        // clean up
-        whisper_free(ctx);
-
-        // whisper init
-        ctx = whisper_init_from_file_with_params(model.c_str(), cparams);
-
-        // TODO perhaps load prior model here instead of exit
-        if (ctx == nullptr) {
-            fprintf(stderr, "error: model init  failed, no model loaded must exit\n");
-            exit(1);
-        }
-
-        // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
-        whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
-
-        const std::string success = "Load was successful!";
-        res.set_content(success, "application/text");
-
-        // check if the model is in the file system
-        whisper_mutex.unlock();
-    });
-
-    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
-        const char fmt[] = "500 Internal Server Error\n%s";
-        char buf[BUFSIZ];
-        try {
-            std::rethrow_exception(std::move(ep));
-        } catch (std::exception &e) {
-            snprintf(buf, sizeof(buf), fmt, e.what());
-        } catch (...) {
-            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
-        }
-        res.set_content(buf, "text/plain");
-        res.status = 500;
-    });
-
-    svr.set_error_handler([](const Request &, Response &res) {
-        if (res.status == 400) {
-            res.set_content("Invalid request", "text/plain");
-        } else if (res.status != 500) {
-            res.set_content("File Not Found", "text/plain");
-            res.status = 404;
-        }
-    });
-
-    // set timeouts and change hostname and port
-    svr.set_read_timeout(sparams.read_timeout);
-    svr.set_write_timeout(sparams.write_timeout);
-
-    if (!svr.bind_to_port(sparams.hostname, sparams.port))
-    {
-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
-                sparams.hostname.c_str(), sparams.port);
-        return 1;
-    }
-
-    // Set the base directory for serving static files
-    svr.set_base_dir(sparams.public_path);
-
-    // to make it ctrl+clickable:
-    printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-
-    if (!svr.listen_after_bind())
-    {
-        return 1;
-    }
-
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
-
-    return 0;
-}
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -446,14 +446,12 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
 }

-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
+static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) {
    ggml_tallocr_t alloc = node_tallocr(galloc, view);

    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-    if (update_backend) {
-        view->backend = view->view_src->backend;
-    }
+    view->backend = view->view_src->backend;
    view->buffer  = view->view_src->buffer;
    view->data    = (char *)view->view_src->data + view->view_offs;

@ -471,7 +469,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {

    if (node->data == NULL) {
        if (ggml_is_view(node)) {
-            init_view(galloc, node, true);
+            init_view(galloc, node);
        } else {
            // see if we can reuse a parent's buffer (inplace)
            if (ggml_op_can_inplace(node->op)) {
@ -501,14 +499,15 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
-                                init_view(galloc, node, false);
+                                init_view(galloc, node);
                                return;
                            }
-                        } else {
+                        }
+                        else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                            node->view_src = parent;
                            p_hn->n_views += 1;
-                            init_view(galloc, node, false);
+                            init_view(galloc, node);
                            return;
                        }
                    }
@ -538,7 +537,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            hash_get(galloc, view_src)->n_views += 1;
            if (node->buffer == NULL && node->data != NULL) {
                // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node, true);
+                init_view(galloc, node);
            }
        }

@ -549,7 +548,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            }
            hash_get(galloc, parent)->n_children += 1;
            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent, true);
+                init_view(galloc, parent);
            }
        }
   }
@ -664,7 +663,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
    return max_size;
 }

-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
+void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) {
    const size_t hash_size = hash_set.size;

    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@ -687,7 +686,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
    // reset hash values
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);

-    galloc->hash_allocs = hash_node_talloc;
+    galloc->hash_allocs = hash_node_alloct;

    ggml_tallocr_alloc_graph_impl(galloc, graph);

--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -6142,9 +6142,6 @@ inline void ggml_cuda_op_add(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne10 = src1->ne[0];
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -1368,12 +1368,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
    float max = x[0];
    float sum_w = weights[0];
    float sum_x = sum_w * x[0];
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 1; i < n; ++i) {
-#else
    for (int i = 1; i < n; ++i) {
-#endif
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
        float w = weights[i];
--- a/ggml.c
+++ b/ggml.c
@ -5024,13 +5024,8 @@ struct ggml_tensor * ggml_rope_back(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
-        int                   n_orig_ctx,
        float                 freq_base,
        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow,
        float                 xpos_base,
        bool                  xpos_down) {
    GGML_ASSERT(ggml_is_vector(b));
@ -5047,15 +5042,11 @@ struct ggml_tensor * ggml_rope_back(

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    memcpy(params + 11, &xpos_base,    sizeof(float));
-    memcpy(params + 12, &xpos_down,    sizeof(bool));
+    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_ROPE_BACK;
@ -9385,6 +9376,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif

+
 static void ggml_compute_forward_mul_mat(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -10954,8 +10946,7 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const bool forward) {
+        struct ggml_tensor * dst) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@ -11014,11 +11005,6 @@ static void ggml_compute_forward_rope_f32(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11035,9 +11021,9 @@ static void ggml_compute_forward_rope_f32(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
+                        const float sin_theta = sinf(theta_base);
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
+                        const float sin_block_theta = sinf(block_theta);

                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@ -11061,7 +11047,6 @@ static void ggml_compute_forward_rope_f32(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
-                        sin_theta *= sin_sign;

                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@ -11092,7 +11077,6 @@ static void ggml_compute_forward_rope_f32(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
-                            sin_theta *= sin_sign;

                            theta_base *= theta_scale;

@ -11118,8 +11102,7 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const bool forward) {
+        struct ggml_tensor * dst) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@ -11171,11 +11154,6 @@ static void ggml_compute_forward_rope_f16(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11192,9 +11170,9 @@ static void ggml_compute_forward_rope_f16(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
+                        const float sin_theta = sinf(theta_base);
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
+                        const float sin_block_theta = sinf(block_theta);

                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@ -11218,7 +11196,6 @@ static void ggml_compute_forward_rope_f16(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
-                        sin_theta *= sin_sign;

                        theta_base *= theta_scale;

@ -11245,7 +11222,6 @@ static void ggml_compute_forward_rope_f16(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
-                            sin_theta *= sin_sign;

                            theta_base *= theta_scale;

@ -11275,11 +11251,11 @@ static void ggml_compute_forward_rope(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst);
            } break;
        default:
            {
@ -11290,6 +11266,216 @@ static void ggml_compute_forward_rope(

 // ggml_compute_forward_rope_back

+static void ggml_compute_forward_rope_back_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // y = rope(x, src1)
+    // dx = rope_back(dy, src1)
+    // src0 is dy, src1 contains options
+
+    float freq_base;
+    float freq_scale;
+
+    // these two only relevant for xPos RoPE:
+    float xpos_base;
+    bool xpos_down;
+
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+    assert(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    const bool is_neox = mode & 2;
+
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            const int64_t p = pos[i2];
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                float theta_base = freq_scale * (float)p;
+
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+                        const float cos_theta = cosf(theta_base);
+                        const float sin_theta = sinf(theta_base);
+
+                        // zeta scaling for xPos only:
+                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
+                        if (xpos_down) zeta = 1.0f / zeta;
+
+                        theta_base *= theta_scale;
+
+                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        const float dy0 = dy[0];
+                        const float dy1 = dy[1];
+
+                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
+                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
+                    }
+                } else {
+                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                            const float cos_theta = cosf(theta_base);
+                            const float sin_theta = sinf(theta_base);
+
+                            theta_base *= theta_scale;
+
+                            const int64_t i0 = ib*n_dims + ic/2;
+
+                            const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                                  float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                            const float dy0 = dy[0];
+                            const float dy1 = dy[n_dims/2];
+
+                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
+                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rope_back_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // y = rope(x, src1)
+    // dx = rope_back(dy, src1)
+    // src0 is dy, src1 contains options
+
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode   = ((int32_t *) dst->op_params)[2];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+    assert(nb0 == sizeof(ggml_fp16_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
+    const bool is_neox = mode & 2;
+
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            const int64_t p = pos[i2];
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                float theta_base = (float)p;
+
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+                        const float cos_theta = cosf(theta_base);
+                        const float sin_theta = sinf(theta_base);
+
+                        theta_base *= theta_scale;
+
+                        const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
+                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
+
+                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
+                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
+                    }
+                } else {
+                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                            const float cos_theta = cosf(theta_base);
+                            const float sin_theta = sinf(theta_base);
+
+                            theta_base *= theta_scale;
+
+                            const int64_t i0 = ib*n_dims + ic/2;
+
+                            const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                                  ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
+                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
+
+                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
+                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 static void ggml_compute_forward_rope_back(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -11298,11 +11484,11 @@ static void ggml_compute_forward_rope_back(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
+                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
+                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
            } break;
        default:
            {
@ -14737,20 +14923,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                // necessary for llama
                if (src0->grad) {
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
+                    const int n_dims = ((int32_t *) tensor->op_params)[1];
+                    const int mode   = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                    float freq_base;
+                    float freq_scale;
+                    float xpos_base;
+                    bool  xpos_down;
+                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));

                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@ -14760,13 +14943,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                n_dims,
                                mode,
                                n_ctx,
-                                n_orig_ctx,
                                freq_base,
                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
                                xpos_base,
                                xpos_down),
                            zero_table);
@ -14776,20 +14954,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                if (src0->grad) {
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
+                    const int n_dims = ((int32_t *) tensor->op_params)[1];
+                    const int mode   = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                    float freq_base;
+                    float freq_scale;
+                    float xpos_base;
+                    bool  xpos_down;
+                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));

                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@ -14798,14 +14973,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src1,
                                n_dims,
                                mode,
+                                0,
                                n_ctx,
-                                n_orig_ctx,
                                freq_base,
                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
+                                0.0f,
+                                1.0f,
+                                0.0f,
+                                0.0f,
                                xpos_base,
                                xpos_down,
                                false),
@ -18073,7 +18248,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    {
        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@ -18120,7 +18295,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                            case GGUF_TYPE_STRING:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                    }
                                } break;
@ -18148,7 +18323,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    {
        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@ -18195,7 +18370,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // compute the total size of the data section, taking into account the alignment
    {
        ctx->size = 0;
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

            const int64_t ne =
@ -18264,7 +18439,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        ggml_set_no_alloc(ctx_data, true);

        // create the tensors
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            const int64_t ne[GGML_MAX_DIMS] = {
                ctx->infos[i].ne[0],
                ctx->infos[i].ne[1],
--- a/ggml.h
+++ b/ggml.h
@ -1371,13 +1371,8 @@ extern "C" {
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
-            int                   n_orig_ctx,
            float                 freq_base,
            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow,
            float                 xpos_base,
            bool                  xpos_down);

--- a/whisper.cpp
+++ b/whisper.cpp
@ -850,8 +850,9 @@ struct whisper_context {
    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
-    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
+    ggml_type wtype_e = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) Encoder
+    ggml_type wtype_d = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) Decoder
+    ggml_type itype   = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)

    whisper_context_params params;

@ -1168,8 +1169,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
        // in order to save memory and also to speed up the computation
-        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-        if (wctx.wtype == GGML_TYPE_COUNT) {
+        wctx.wtype_e = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+        if (wctx.wtype_e == GGML_TYPE_COUNT) {
            WHISPER_LOG_ERROR("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
            return false;
        }
@ -1290,8 +1291,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        WHISPER_LOG_INFO("%s: n_langs       = %d\n", __func__, vocab.num_languages());
    }

-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
+    const ggml_type wtype_e = wctx.wtype_e;
+    const ggml_type wtype_d = wctx.wtype_d;
+    const ggml_type vtype   = wctx.wtype_e == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type

    // create the ggml context
    {
@ -1367,24 +1369,24 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                layer.mlp_ln_w    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
                layer.mlp_ln_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);

-                layer.mlp_0_w     = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, 4*n_audio_state);
+                layer.mlp_0_w     = ggml_new_tensor_2d(ctx, wtype_e,         n_audio_state, 4*n_audio_state);
                layer.mlp_0_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);

-                layer.mlp_1_w     = ggml_new_tensor_2d(ctx, wtype,         4*n_audio_state, n_audio_state);
+                layer.mlp_1_w     = ggml_new_tensor_2d(ctx, wtype_e,       4*n_audio_state, n_audio_state);
                layer.mlp_1_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);

                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);

-                layer.attn_q_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
+                layer.attn_q_w    = ggml_new_tensor_2d(ctx, wtype_e,         n_audio_state, n_audio_state);
                layer.attn_q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);

-                layer.attn_k_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
+                layer.attn_k_w    = ggml_new_tensor_2d(ctx, wtype_e,         n_audio_state, n_audio_state);

-                layer.attn_v_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
+                layer.attn_v_w    = ggml_new_tensor_2d(ctx, wtype_e,         n_audio_state, n_audio_state);
                layer.attn_v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);

-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
+                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_e,         n_audio_state, n_audio_state);
                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);

                // map by name
@ -1417,7 +1419,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        {
            model.d_pe   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);

-            model.d_te   = ggml_new_tensor_2d(ctx, wtype,         n_text_state, n_vocab);
+            model.d_te   = ggml_new_tensor_2d(ctx, wtype_d,       n_text_state, n_vocab);

            model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
            model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
@ -1436,38 +1438,38 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                layer.mlp_ln_w          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
                layer.mlp_ln_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.mlp_0_w           = ggml_new_tensor_2d(ctx, wtype,           n_text_state, 4*n_text_state);
+                layer.mlp_0_w           = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, 4*n_text_state);
                layer.mlp_0_b           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);

-                layer.mlp_1_w           = ggml_new_tensor_2d(ctx, wtype,         4*n_text_state, n_text_state);
+                layer.mlp_1_w           = ggml_new_tensor_2d(ctx, wtype_d,       4*n_text_state, n_text_state);
                layer.mlp_1_b           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

                layer.attn_ln_0_w       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
                layer.attn_ln_0_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.attn_q_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.attn_q_w          = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);
                layer.attn_q_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.attn_k_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.attn_k_w          = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);

-                layer.attn_v_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.attn_v_w          = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);
                layer.attn_v_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.attn_ln_1_w       = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.attn_ln_1_w       = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);
                layer.attn_ln_1_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

                layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
                layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.cross_attn_q_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.cross_attn_q_w    = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);
                layer.cross_attn_q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.cross_attn_k_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.cross_attn_k_w    = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);

-                layer.cross_attn_v_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.cross_attn_v_w    = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);
                layer.cross_attn_v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

-                layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
+                layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_d,         n_text_state, n_text_state);
                layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);

                // map by name
@ -1732,20 +1734,22 @@ static struct ggml_cgraph * whisper_build_graph_conv(
        // convolution + gelu
        {
            cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
-            if (n_ctx == hparams.n_audio_ctx) {
-                cur = ggml_add(ctx0, cur, model.e_conv_1_b);
-            } else {
-                cur = ggml_add(ctx0, cur, ggml_cont(ctx0, ggml_view_2d(ctx0, model.e_conv_1_b, cur->ne[0], cur->ne[1], model.e_conv_1_b->nb[1], 0)));
-            }
+            cur = ggml_add(ctx0, cur, model.e_conv_1_b);
+            //cur = ggml_add(ctx0,
+            //        ggml_repeat(ctx0,
+            //            model.e_conv_1_b,
+            //            cur),
+            //        cur);

            cur = ggml_gelu(ctx0, cur);

            cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
-            if (n_ctx == hparams.n_audio_ctx) {
-                cur = ggml_add(ctx0, cur, model.e_conv_2_b);
-            } else {
-                cur = ggml_add(ctx0, cur, ggml_cont(ctx0, ggml_view_2d(ctx0, model.e_conv_2_b, cur->ne[0], cur->ne[1], model.e_conv_2_b->nb[1], 0)));
-            }
+            cur = ggml_add(ctx0, cur, model.e_conv_2_b);
+            //cur = ggml_add(ctx0,
+            //        ggml_repeat(ctx0,
+            //            model.e_conv_2_b,
+            //            cur),
+            //        cur);

            cur = ggml_gelu(ctx0, cur);
        }
@ -3525,7 +3529,7 @@ int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
 int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
    whisper_batch_prep_legacy(state->batch, tokens, n_tokens, n_past, 0);

-    whisper_kv_cache_seq_rm(state->kv_self, 0, n_past, -1);
+    whisper_kv_cache_seq_rm(ctx->state->kv_self, 0, n_past, -1);

    if (!whisper_decode_internal(*ctx, *state, state->batch, n_threads, nullptr, nullptr)) {
        WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
@ -3538,10 +3542,19 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state
 int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
    if (ctx->state == nullptr) {
        WHISPER_LOG_ERROR("%s: ERROR state was not loaded.\n", __func__);
-        return -1;
+        return false;
    }

-    return whisper_decode_with_state(ctx, ctx->state, tokens, n_tokens, n_past, n_threads);
+    whisper_kv_cache_seq_rm(ctx->state->kv_self, 0, n_past, -1);
+
+    whisper_batch_prep_legacy(ctx->state->batch, tokens, n_tokens, n_past, 0);
+
+    if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->batch, n_threads, nullptr, nullptr)) {
+        WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
 }

 int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
@ -5177,7 +5190,7 @@ int whisper_full_with_state(
            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);

            params.progress_callback(
-                ctx, state, progress_cur, params.progress_callback_user_data);
+                ctx, ctx->state, progress_cur, params.progress_callback_user_data);
        }

        // of only 1 second left, then stop
@ -6064,43 +6077,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
    // 1GB array
    const size_t size = arr*1e6;

-    double sum  = 0.0;
-
-    // heat-up
-    {
-        char * src = (char *) malloc(size);
-        char * dst = (char *) malloc(size);
-
-        for (size_t i = 0; i < size; i++) src[i] = i;
-
-        memcpy(dst, src, size); // heat-up
-
-        double tsum = 0.0;
-
-        for (size_t i = 0; i < n; i++) {
-            const int64_t t0 = ggml_time_us();
-
-            memcpy(dst, src, size);
-
-            const int64_t t1 = ggml_time_us();
-
-            tsum += (t1 - t0)*1e-6;
-
-            src[rand() % size] = rand() % 256;
-        }
-
-        snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
-        s += strbuf;
-
-        // needed to prevent the compiler from optimizing the memcpy away
-        {
-            for (size_t i = 0; i < size; i++) sum += dst[i];
-        }
-
-        free(src);
-        free(dst);
-    }
-
    // single-thread
    {
        char * src = (char *) malloc(size);
@ -6111,6 +6087,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
        memcpy(dst, src, size); // heat-up

        double tsum = 0.0;
+        double sum  = 0.0;

        for (size_t i = 0; i < n; i++) {
            const int64_t t0 = ggml_time_us();
@ -6124,73 +6101,21 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
            src[rand() % size] = rand() % 256;
        }

-        snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
+        snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
        s += strbuf;

        // needed to prevent the compiler from optimizing the memcpy away
        {
            for (size_t i = 0; i < size; i++) sum += dst[i];
+
+            snprintf(strbuf, sizeof(strbuf), "sum:    %f\n", sum);
+            s += strbuf;
        }

        free(src);
        free(dst);
    }

-    // multi-thread
-
-    for (uint32_t n_threads = 1; n_threads <= std::thread::hardware_concurrency(); n_threads++) {
-        char * src = (char *) malloc(size);
-        char * dst = (char *) malloc(size);
-
-        for (size_t i = 0; i < size; i++) src[i] = i;
-
-        memcpy(dst, src, size); // heat-up
-
-        double tsum = 0.0;
-
-        auto helper = [&](int th) {
-            const int64_t i0 = (th + 0)*size/n_threads;
-            const int64_t i1 = (th + 1)*size/n_threads;
-
-            for (size_t i = 0; i < n; i++) {
-                memcpy(dst + i0, src + i0, i1 - i0);
-
-                src[i0 + rand() % (i1 - i0)] = rand() % 256;
-            };
-        };
-
-        const int64_t t0 = ggml_time_us();
-
-        std::vector<std::thread> threads(n_threads - 1);
-        for (uint32_t th = 0; th < n_threads - 1; ++th) {
-            threads[th] = std::thread(helper, th);
-        }
-
-        helper(n_threads - 1);
-
-        for (uint32_t th = 0; th < n_threads - 1; ++th) {
-            threads[th].join();
-        }
-
-        const int64_t t1 = ggml_time_us();
-
-        tsum += (t1 - t0)*1e-6;
-
-        snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), n_threads);
-        s += strbuf;
-
-        // needed to prevent the compiler from optimizing the memcpy away
-        {
-            for (size_t i = 0; i < size; i++) sum += dst[i];
-        }
-
-        free(src);
-        free(dst);
-    }
-
-    snprintf(strbuf, sizeof(strbuf), "sum:    %f\n", sum);
-    s += strbuf;
-
    return s.c_str();
 }

--- a/whisper.h
+++ b/whisper.h
@ -50,9 +50,7 @@ extern "C" {
    //
    //     ...
    //
-    //     whisper_context_params cparams = whisper_context_default_params();
-    // 
-    //     struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
+    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
    //
    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
    //         fprintf(stderr, "failed to process audio\n");