Fixed thinking-tag extraction and removal to handle both <thinking> and <think> tags

This commit is contained in:
Saifeddine ALOUI 2025-02-07 21:57:40 +01:00
parent 7bb21b6c76
commit 998bdd4015

View File

@ -1144,9 +1144,10 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory.
"formatted_string": formatted_string "formatted_string": formatted_string
} }
def extract_thinking_blocks(self, text: str) -> List[str]: def extract_thinking_blocks(self, text: str) -> List[str]:
""" """
Extracts content between <thinking> tags from a given text. Extracts content between <thinking> or <think> tags from a given text.
Parameters: Parameters:
text (str): The text containing thinking blocks text (str): The text containing thinking blocks
@ -1156,19 +1157,18 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory.
""" """
import re import re
# Find all matches between thinking tags # Pattern to match both <thinking> and <think> blocks with matching tags
pattern = r'<thinking>(.*?)</thinking>' pattern = r'<(thinking|think)>(.*?)</\1>'
# re.DOTALL allows . to match newlines
matches = re.finditer(pattern, text, re.DOTALL) matches = re.finditer(pattern, text, re.DOTALL)
# Extract and clean the content # Extract content from the second group (index 2) and clean
thinking_blocks = [match.group(1).strip() for match in matches] thinking_blocks = [match.group(2).strip() for match in matches]
return thinking_blocks return thinking_blocks
def remove_thinking_blocks(self, text: str) -> str: def remove_thinking_blocks(self, text: str) -> str:
""" """
Removes thinking blocks from text including the tags. Removes thinking blocks (either <thinking> or <think>) from text including the tags.
Parameters: Parameters:
text (str): The text containing thinking blocks text (str): The text containing thinking blocks
@ -1178,8 +1178,8 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory.
""" """
import re import re
# Replace thinking blocks with empty string # Pattern to remove both <thinking> and <think> blocks with matching tags
pattern = r'<thinking>.*?</thinking>' pattern = r'<(thinking|think)>.*?</\1>'
cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL) cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
# Remove extra whitespace and normalize newlines # Remove extra whitespace and normalize newlines
@ -1187,7 +1187,6 @@ Don't forget encapsulate the code inside a html code tag. This is mandatory.
return cleaned_text return cleaned_text
def extract_code_blocks(self, text: str, return_remaining_text: bool = False) -> Union[List[dict], Tuple[List[dict], str]]: def extract_code_blocks(self, text: str, return_remaining_text: bool = False) -> Union[List[dict], Tuple[List[dict], str]]:
""" """
This function extracts code blocks from a given text and optionally returns the text without code blocks. This function extracts code blocks from a given text and optionally returns the text without code blocks.