From baa30bacdb214ddbd62a308769a991a23c933eec Mon Sep 17 00:00:00 2001
From: JacobLinCool
Date: Tue, 30 Jan 2024 20:15:55 +0800
Subject: [PATCH] server : add fields to `verbose_json` response (#1802)

* server: include additional fields in the verbose_json response as OpenAI does

* server: show request examples on home page

* server: todo note for compression_ratio and no_speech_prob

* server: add simple demo form to the homepage
---
 examples/server/server.cpp | 89 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7de31859..69c04bf3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
             {"Access-Control-Allow-Origin", "*"},
             {"Access-Control-Allow-Headers", "content-type"}});

-    std::string const default_content = "hello";
+    std::string const default_content = R"(
+    <html>
+    <head>
+        <title>Whisper.cpp Server</title>
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <style>
+        body   { font-family: sans-serif; }
+        form   { display: flex; flex-direction: column; align-items: flex-start; }
+        label  { margin-bottom: 0.5rem; }
+        input, select { margin-bottom: 1rem; }
+        button { margin-top: 1rem; }
+        </style>
+    </head>
+    <body>
+        <h1>Whisper.cpp Server</h1>
+
+        <h2>/inference</h2>
+        <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
+    -H "Content-Type: multipart/form-data" \
+    -F file="@<file-path>" \
+    -F temperature="0.0" \
+    -F temperature_inc="0.2" \
+    -F response_format="json"
+        </pre>
+
+        <h2>/load</h2>
+        <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
+    -H "Content-Type: multipart/form-data" \
+    -F model="<path-to-model-file>"
+        </pre>
+
+        <div>
+            <h2>Try it out</h2>
+            <form action="/inference" method="POST" enctype="multipart/form-data">
+                <label for="file">Choose an audio file:</label>
+                <input type="file" id="file" name="file" accept="audio/*" required><br>
+
+                <label for="temperature">Temperature:</label>
+                <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
+
+                <label for="response_format">Response Format:</label>
+                <select id="response_format" name="response_format">
+                    <option value="verbose_json">Verbose JSON</option>
+                    <option value="json">JSON</option>
+                    <option value="text">Text</option>
+                    <option value="srt">SRT</option>
+                    <option value="vtt">VTT</option>
+                </select><br>
+
+                <button type="submit">Submit</button>
+            </form>
+        </div>
+    </body>
+    </html>
+    )";

     // store default params so we can reset after each inference request
     whisper_params default_params = params;

@@ -787,7 +856,13 @@ int main(int argc, char ** argv) {
         } else if (params.response_format == vjson_format) {
             /* try to match openai/whisper's Python format */
             std::string results = output_str(ctx, params, pcmf32s);
-            json jres = json{{"text", results}};
+            json jres = json{
+                {"task", params.translate ? "translate" : "transcribe"},
+                {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
+                {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
+                {"text", results},
+                {"segments", json::array()}
+            };
             const int n_segments = whisper_full_n_segments(ctx);
             for (int i = 0; i < n_segments; ++i)
             {
@@ -801,6 +876,7 @@ int main(int argc, char ** argv) {
                     segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
                 }

+                float total_logprob = 0;
                 const int n_tokens = whisper_full_n_tokens(ctx, i);
                 for (int j = 0; j < n_tokens; ++j) {
                     whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
@@ -815,8 +891,17 @@ int main(int argc, char ** argv) {
                         word["end"] = token.t1 * 0.01;
                     }
                     word["probability"] = token.p;
+                    total_logprob += token.plog;
                     segment["words"].push_back(word);
                 }
+
+                segment["temperature"] = params.temperature;
+                segment["avg_logprob"] = total_logprob / n_tokens;
+
+                // TODO compression_ratio and no_speech_prob are not implemented yet
+                // segment["compression_ratio"] = 0;
+                // segment["no_speech_prob"] = 0;
+
                 jres["segments"].push_back(segment);
             }
             res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
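
For reference, a sketch of a request using the new format and the rough shape of
the response the patched handler builds. The transcript, timings, and probabilities
are illustrative values, not real output; segment fields that pre-date this patch
are omitted, the `word` key on word entries is assumed from the surrounding code,
and `compression_ratio` / `no_speech_prob` are still TODO as noted in the hunk
above. This assumes the server is listening on the default port 8080:

    curl 127.0.0.1:8080/inference \
    -H "Content-Type: multipart/form-data" \
    -F file="@samples/jfk.wav" \
    -F response_format="verbose_json"

    {
      "task": "transcribe",
      "language": "english",
      "duration": 11.0,
      "text": "And so my fellow Americans...",
      "segments": [
        {
          "start": 0.0,
          "end": 11.0,
          "temperature": 0.0,
          "avg_logprob": -0.18,
          "words": [
            { "word": "And", "start": 0.0, "end": 0.4, "probability": 0.99 }
          ]
        }
      ]
    }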