whisper.cpp/examples/common.cpp

#include "common.h"

// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"

#include <cmath>
#include <regex>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// Return a copy of `s` with leading and trailing whitespace removed.
//
// Whitespace here is the fixed set: space, '\t', '\n', '\v', '\f', '\r'.
// Interior whitespace is never touched. An empty or whitespace-only input
// yields an empty string.
//
// Implemented as a plain character scan rather than with std::regex: building
// a regex object on every call is costly, and a scan has no dependence on how
// the regex engine treats '^' / '$' next to embedded line breaks.
std::string trim(const std::string & s) {
    static const char * const k_whitespace = " \t\n\v\f\r";

    const size_t first = s.find_first_not_of(k_whitespace);
    if (first == std::string::npos) {
        // nothing but whitespace (or nothing at all)
        return std::string();
    }

    // a non-whitespace character exists, so this cannot be npos
    const size_t last = s.find_last_not_of(k_whitespace);

    return s.substr(first, last - first + 1);
}

// Return a copy of `s` in which every non-overlapping occurrence of `from`
// is replaced by `to`.
//
// The scan resumes right after each inserted `to`, so text introduced by a
// replacement is never matched again (e.g. replacing "a" with "aa" terminates).
// An empty `from` is treated as "nothing to replace" and `s` is returned
// unchanged; without that guard the search would match at every position
// forever.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;

    if (from.empty()) {
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length(); // skip over the text we just inserted
    }
    return result;
}

// Load 16-bit PCM WAV audio and convert it to float samples.
//
//   fname   - path of the WAV file, or "-" to read the whole WAV stream from stdin
//   pcmf32  - output: one float per frame; stereo input is averaged down to mono
//   pcmf32s - output, filled only when `stereo` is true: [0] = first channel,
//             [1] = second channel of each interleaved frame
//   stereo  - request the per-channel output; the input must then have 2 channels
//
// Returns true on success. Returns false (after printing a message to stderr)
// when the input cannot be opened/parsed, is not mono or stereo, does not have
// a sample rate of COMMON_SAMPLE_RATE, or is not 16-bit. The decoder is released
// on every exit path once it has been successfully initialized.
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin

    if (fname == "-") {
        {
            // slurp stdin until EOF (or a read error) into memory
            uint8_t buf[1024];
            while (true)
            {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }

        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
    }

    // From here on `wav` is initialized: every early return must call
    // drwav_uninit() so the decoder (and the file it opened) is not leaked.

    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
        drwav_uninit(&wav);
        return false;
    }

    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    // Number of frames to convert. For a file this is the count reported by the
    // decoder; for stdin it is derived from the number of bytes received.
    // NOTE(review): the stdin estimate divides the size of the entire buffer, so
    // it presumably counts non-audio bytes too; frames that are not actually
    // decoded stay at the zero value pcm16 is resized with.
    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);

    // remember the layout before the decoder is torn down
    const bool is_mono = wav.channels == 1;

    std::vector<int16_t> pcm16;
    pcm16.resize(n*wav.channels);
    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
    drwav_uninit(&wav);

    // convert to mono, float
    pcmf32.resize(n);
    if (is_mono) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        // average the two interleaved channels: (l + r) / 2 / 32768
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }

    if (stereo) {
        // convert to stereo, float
        pcmf32s.resize(2);

        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }

    return true;
}

// In-place first-order (RC) high-pass filter.
//
//   data        - samples to filter; overwritten with the filtered signal
//   cutoff      - cutoff frequency, in the same unit as sample_rate (e.g. Hz)
//   sample_rate - number of samples per unit of time
//
// Recurrence, with x the input and y the output:
//
//   y[0] = x[0]
//   y[i] = alpha * (y[i-1] + x[i] - x[i-1]),   alpha = rc / (rc + dt)
//
// where rc = 1 / (2*pi*cutoff) and dt = 1 / sample_rate.
//
// Empty input, or a non-positive cutoff / sample_rate, leaves `data` untouched.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty() || !(cutoff > 0.0f) || !(sample_rate > 0.0f)) {
        return;
    }

    const float two_pi = 6.283185307179586f;

    const float rc    = 1.0f / (two_pi * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = rc / (rc + dt);

    float y      = data[0];
    float x_prev = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        // read the input sample before its slot is overwritten with the output,
        // so the next iteration still sees the original x[i-1]
        const float x = data[i];

        y = alpha * (y + x - x_prev);

        x_prev  = x;
        data[i] = y;
    }
}

// Simple energy-based voice activity check.
//
// Compares the mean absolute amplitude of the trailing `last_ms` milliseconds
// of `pcmf32` with the mean absolute amplitude of the whole buffer.
//
//   pcmf32      - samples to analyse; filtered in place when freq_thold > 0
//   sample_rate - samples per second, used to turn last_ms into a sample count
//   last_ms     - length of the trailing window, in milliseconds
//   vad_thold   - the tail counts as "loud" when its energy exceeds
//                 vad_thold * overall energy
//   freq_thold  - when > 0, high_pass_filter() is applied with this cutoff first
//   verbose     - print the computed energies and thresholds to stderr
//
// Returns false when the buffer is not longer than the trailing window, or when
// the tail is "loud" as defined above; returns true otherwise.
// NOTE(review): a last_ms that yields a window of 0 samples makes the tail
// average 0/0; the comparison is then false and the function returns true.
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int total = pcmf32.size();
    const int tail  = (sample_rate * last_ms) / 1000;

    // not enough samples - assume no speech
    if (tail >= total) {
        return false;
    }

    const bool want_filter = freq_thold > 0.0f;
    if (want_filter) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }

    // sum of |x| over the whole buffer
    float energy_all = 0.0f;
    for (int k = 0; k < total; ++k) {
        energy_all += fabsf(pcmf32[k]);
    }

    // sum of |x| over the trailing window only
    float energy_last = 0.0f;
    for (int k = total - tail; k < total; ++k) {
        energy_last += fabsf(pcmf32[k]);
    }

    // turn the sums into averages
    energy_all  /= total;
    energy_last /= tail;

    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }

    const bool tail_is_loud = energy_last > vad_thold*energy_all;
    return !tail_is_loud;
}
examples : refactor in order to reuse code and reduce duplication (#482) * examples : refactor common code into a library * examples : refactor common SDL code into a library * make : update Makefile to use common libs * common : fix MSVC M_PI .. * addon.node : link common lib 2023-02-15 17:28:10 +00:00			`#include "common.h"`

			`// third-party utilities`
			`// use your favorite implementations`
			`#define DR_WAV_IMPLEMENTATION`
			`#include "dr_wav.h"`

			`#include <cmath>`
			`#include <regex>`

			`#ifndef M_PI`
			`#define M_PI 3.14159265358979323846`
			`#endif`

			`std::string trim(const std::string & s) {`
			`std::regex e("^\\s+|\\s+$");`
			`return std::regex_replace(s, e, "");`
			`}`

			`std::string replace(const std::string & s, const std::string & from, const std::string & to) {`
			`std::string result = s;`
			`size_t pos = 0;`
			`while ((pos = result.find(from, pos)) != std::string::npos) {`
			`result.replace(pos, from.length(), to);`
			`pos += to.length();`
			`}`
			`return result;`
			`}`

			`bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {`
			`drwav wav;`
			`std::vector<uint8_t> wav_data; // used for pipe input from stdin`

			`if (fname == "-") {`
			`{`
			`uint8_t buf[1024];`
			`while (true)`
			`{`
			`const size_t n = fread(buf, 1, sizeof(buf), stdin);`
			`if (n == 0) {`
			`break;`
			`}`
			`wav_data.insert(wav_data.end(), buf, buf + n);`
			`}`
			`}`

			`if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {`
			`fprintf(stderr, "error: failed to open WAV file from stdin\n");`
			`return false;`
			`}`

			`fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());`
			`}`
			`else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {`
			`fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());`
			`return false;`
			`}`

			`if (wav.channels != 1 && wav.channels != 2) {`
			`fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());`
			`return false;`
			`}`

			`if (stereo && wav.channels != 2) {`
			`fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());`
			`return false;`
			`}`

			`if (wav.sampleRate != COMMON_SAMPLE_RATE) {`
			`fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);`
			`return false;`
			`}`

			`if (wav.bitsPerSample != 16) {`
			`fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());`
			`return false;`
			`}`

			`const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);`

			`std::vector<int16_t> pcm16;`
			`pcm16.resize(n*wav.channels);`
			`drwav_read_pcm_frames_s16(&wav, n, pcm16.data());`
			`drwav_uninit(&wav);`

			`// convert to mono, float`
			`pcmf32.resize(n);`
			`if (wav.channels == 1) {`
			`for (uint64_t i = 0; i < n; i++) {`
			`pcmf32[i] = float(pcm16[i])/32768.0f;`
			`}`
			`} else {`
			`for (uint64_t i = 0; i < n; i++) {`
			`pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;`
			`}`
			`}`

			`if (stereo) {`
			`// convert to stereo, float`
			`pcmf32s.resize(2);`

			`pcmf32s[0].resize(n);`
			`pcmf32s[1].resize(n);`
			`for (uint64_t i = 0; i < n; i++) {`
			`pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;`
			`pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;`
			`}`
			`}`

			`return true;`
			`}`

			`void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {`
			`const float rc = 1.0f / (2.0f * M_PI * cutoff);`
			`const float dt = 1.0f / sample_rate;`
			`const float alpha = dt / (rc + dt);`

			`float y = data[0];`

			`for (size_t i = 1; i < data.size(); i++) {`
			`y = alpha * (y + data[i] - data[i - 1]);`
			`data[i] = y;`
			`}`
			`}`

			`bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {`
			`const int n_samples = pcmf32.size();`
			`const int n_samples_last = (sample_rate * last_ms) / 1000;`

			`if (n_samples_last >= n_samples) {`
			`// not enough samples - assume no speech`
			`return false;`
			`}`

			`if (freq_thold > 0.0f) {`
			`high_pass_filter(pcmf32, freq_thold, sample_rate);`
			`}`

			`float energy_all = 0.0f;`
			`float energy_last = 0.0f;`

			`for (int i = 0; i < n_samples; i++) {`
			`energy_all += fabsf(pcmf32[i]);`

			`if (i >= n_samples - n_samples_last) {`
			`energy_last += fabsf(pcmf32[i]);`
			`}`
			`}`

			`energy_all /= n_samples;`
			`energy_last /= n_samples_last;`

			`if (verbose) {`
			`fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);`
			`}`

			`if (energy_last > vad_thold*energy_all) {`
			`return false;`
			`}`

			`return true;`
			`}`