fix(utf8): prevent multi-byte utf8 characters from being mangled (#981)

mirror of https://github.com/mudler/LocalAI.git synced 2025-05-02 16:53:17 +00:00

**Description**

This PR fixes #677 using [suggested
solution](https://github.com/go-skynet/LocalAI/issues/677#issuecomment-1695939097)
from @yantoz

before:
```
❯ curl -N http://localhost:57541/v1/completions -H "Content-Type: application/json" -d '{
     "model": "ggml-model-q4_0.bin",
     "prompt": "",
     "max_tokens": 32,
     "temperature": 0.7,
     "stream": true
   }'
data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":"\ufffd"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":"\ufffd"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":"\ufffd"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":"\ufffd"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":" |"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":" I"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":"'"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"text":"m"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
```

now:
```
❯ curl -N http://localhost:57541/v1/completions -H Content-Type: application/json -d {
   "model": "ggml-model-q4_0.bin",
   "prompt": "",
   "max_tokens": 32,
   "temperature": 0.7,
   "stream": true
 }
data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":"😂"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":" "}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":"|"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":" "}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":"I"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":"'"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

data: {"object":"text_completion","model":"ggml-model-q4_0.bin","choices":[{"index":0,"text":"m"}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
```

**Notes for Reviewers**


**[Signed
commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
- [X] Yes, I signed my commits.
 

<!--
Thank you for contributing to LocalAI! 

Contributing Conventions:

1. Include descriptive PR titles with [<component-name>] prepended.
2. Build and test your changes before submitting a PR. 
3. Sign your commits

By following the community's contribution conventions upfront, the
review process will
be accelerated and your PR merged more quickly.
-->

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

This commit is contained in:

Samuel Maynard

2023-08-31 01:56:59 +02:00

committed by

GitHub

parent b905c07650

commit deeef5fc24

No known key found for this signature in database

GPG Key ID: 4AEE18F83AFDEB23

1 changed files with 18 additions and 3 deletions

									
										21

api/backend/llm.go
									
											View File
											
				@ -6,6 +6,7 @@ import (

					"regexp"

					"strings"

					"sync"

					"unicode/utf8"

					config "github.com/go-skynet/LocalAI/api/config"

					"github.com/go-skynet/LocalAI/api/options"

				@ -97,9 +98,23 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c

						if tokenCallback != nil {

							ss := ""

							err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {

								tokenCallback(string(s), tokenUsage)

								ss += string(s)

							var partialRune []byte

							err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {

								partialRune = append(partialRune, chars...)

								for len(partialRune) > 0 {

									r, size := utf8.DecodeRune(partialRune)

									if r == utf8.RuneError {

										// incomplete rune, wait for more bytes

										break

									}

									tokenCallback(string(r), tokenUsage)

									ss += string(r)

									partialRune = partialRune[size:]

								}

							})

							return LLMResponse{

								Response: ss,

fix(utf8): prevent multi-byte utf8 characters from being mangled (#981)

21 api/backend/llm.go Unescape Escape View File

21

api/backend/llm.go

View File