From 998bdd4015ab3e3d206bced041a7112b6e35845f Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 7 Feb 2025 21:57:40 +0100 Subject: [PATCH] fixed thinking tags extraction --- lollms/personality.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lollms/personality.py b/lollms/personality.py index 7412ade..71e1963 100644 --- a/lollms/personality.py +++ b/lollms/personality.py @@ -1144,9 +1144,10 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory. "formatted_string": formatted_string } + def extract_thinking_blocks(self, text: str) -> List[str]: """ - Extracts content between tags from a given text. + Extracts content between or tags from a given text. Parameters: text (str): The text containing thinking blocks @@ -1156,19 +1157,18 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory. """ import re - # Find all matches between thinking tags - pattern = r'(.*?)' - # re.DOTALL allows . to match newlines + # Pattern to match both and blocks with matching tags + pattern = r'<(thinking|think)>(.*?)' matches = re.finditer(pattern, text, re.DOTALL) - # Extract and clean the content - thinking_blocks = [match.group(1).strip() for match in matches] + # Extract content from the second group (index 2) and clean + thinking_blocks = [match.group(2).strip() for match in matches] return thinking_blocks def remove_thinking_blocks(self, text: str) -> str: """ - Removes thinking blocks from text including the tags. + Removes thinking blocks (either or ) from text including the tags. Parameters: text (str): The text containing thinking blocks @@ -1178,15 +1178,14 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory. """ import re - # Replace thinking blocks with empty string - pattern = r'.*?' + # Pattern to remove both and blocks with matching tags + pattern = r'<(thinking|think)>.*?' cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL) # Remove extra whitespace and normalize newlines cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text.strip()) - return cleaned_text - + return cleaned_text def extract_code_blocks(self, text: str, return_remaining_text: bool = False) -> Union[List[dict], Tuple[List[dict], str]]: """