From 23c21e92ebebd862448ec5c175b8d3abc20ec6fa Mon Sep 17 00:00:00 2001 From: Ismatulla Mansurov <47342870+sapoepsilon@users.noreply.github.com> Date: Mon, 27 Nov 2023 02:28:34 -0700 Subject: [PATCH] server : automatically convert audio on the server (#1539) * server : automatically convert audio on the server * server : remove redundant comments * server : automatic conversion refactor * server : update server readme * server : remove unnecessary comments and tabs * server : put back remove calling * server : apply suggestions from code review Co-authored-by: Georgi Gerganov * server : check ffmpeg before the server launch * server : fix indentation * Apply suggestions from code review Co-authored-by: Georgi Gerganov * server : fix function typo calling * server : fix function typo calling * server : add warning in readme --------- Co-authored-by: Georgi Gerganov --- examples/server/README.md | 4 +++ examples/server/server.cpp | 59 +++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 2a43541f..84b85e87 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -43,8 +43,12 @@ options: -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference --host HOST, [127.0.0.1] Hostname/ip-adress for the server --port PORT, [8080 ] Port number for the server + --convert, [false ] Convert audio to WAV, requires ffmpeg on the server ``` +> [!WARNING] +> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. 
Always validate and sanitize inputs to guard against potential security threats.** + ## request examples **/inference** diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ae0b7a26..dc5317a0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,6 +43,8 @@ struct server_params int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; + + bool ffmpeg_converter = false; }; struct whisper_params { @@ -157,6 +159,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port); fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str()); + fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false"); fprintf(stderr, "\n"); } @@ -203,6 +206,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); } else if ( arg == "--host") { sparams.hostname = argv[++i]; } else if ( arg == "--public") { sparams.public_path = argv[++i]; } + else if ( arg == "--convert") { sparams.ffmpeg_converter = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params, sparams); @@ -220,6 +224,45 @@ struct whisper_print_user_data { int progress_prev; }; +void check_ffmpeg_availibility() { + int result = system("ffmpeg -version"); + + if (result == 0) { + std::cout << "ffmpeg is available." << std::endl; + } else { + // ffmpeg is not available + std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed "; + std::cout << "and that its executable is included in your system's PATH. 
"; + exit(0); + } +} + +bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) { + std::ostringstream cmd_stream; + std::string converted_filename_temp = temp_filename + "_temp.wav"; + cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1"; + std::string cmd = cmd_stream.str(); + + int status = std::system(cmd.c_str()); + if (status != 0) { + error_resp = "{\"error\":\"FFmpeg conversion failed.\"}"; + return false; + } + + // Remove the original file + if (remove(temp_filename.c_str()) != 0) { + error_resp = "{\"error\":\"Failed to remove the original file.\"}"; + return false; + } + + // Rename the temporary file to match the original filename + if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) { + error_resp = "{\"error\":\"Failed to rename the temporary file.\"}"; + return false; + } + return true; +} + std::string estimate_diarization_speaker(std::vector> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) { std::string speaker = ""; const int64_t n_samples = pcmf32s[0].size(); @@ -407,6 +450,9 @@ int main(int argc, char ** argv) { exit(0); } + if (sparams.ffmpeg_converter) { + check_ffmpeg_availibility(); + } // whisper init struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; @@ -462,6 +508,18 @@ int main(int argc, char ** argv) { temp_file << audio_file.content; temp_file.close(); + // if file is not wav, convert to wav + + if (sparams.ffmpeg_converter) { + std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}"; + const bool is_converted = convert_to_wav(temp_filename, error_resp); + if (!is_converted) { + res.set_content(error_resp, "application/json"); + whisper_mutex.unlock(); + return; + } + } + // read wav content into pcmf32 if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) { fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str()); @@ -509,7 +567,6 
@@ int main(int argc, char ** argv) { // run the inference { - printf("Running whisper.cpp inference on %s\n", filename.c_str()); whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);