def extract_code_blocks(self, text: str, return_remaining_text: bool = False) -> Union[List[dict], Tuple[List[dict], str]]:
    """
    Extract the triple-backtick code blocks found in a text.

    Parameters:
        text (str): The text to scan. Code blocks are delimited by triple backticks (```).
        return_remaining_text (bool): If True, also return the text with the code blocks removed.

    Returns:
        Union[List[dict], Tuple[List[dict], str]]:
            - If return_remaining_text is False: the list of code block dictionaries.
            - If return_remaining_text is True: a tuple containing
                * the list of code block dictionaries
                * the non-code text; each chunk between blocks is stripped and the
                  non-empty chunks are joined with a newline

        Each code block dictionary contains:
            - 'index' (int): Position of the block in the text (0-based)
            - 'file_name' (str): File name announced on the last line before the block, '' if none
            - 'content' (str): The stripped content of the block
            - 'type' (str): The text following the opening fence on the same line (e.g. 'python'), '' if none
            - 'is_complete' (bool): True if the block has a closing fence, False otherwise
    """
    fence = '```'
    # NOTE(review): the tag spelling is inferred from the previously hard-coded
    # offset of 11 (== len('<file_name>')) and from the 'file_name' key — confirm.
    open_tag, close_tag = '<file_name>', '</file_name>'
    alt_prefix = '## filename:'

    text = text or ''
    codes = []
    text_parts = []  # stripped chunks of text located outside the code blocks
    cursor = 0       # first character of `text` that has not been consumed yet

    while True:
        fence_pos = text.find(fence, cursor)
        if fence_pos == -1:
            break

        preceding = text[cursor:fence_pos].strip()
        text_parts.append(preceding)

        # A file name may be announced on the last line before the opening fence.
        file_name = ''
        preceding_lines = preceding.splitlines()
        if preceding_lines:
            last_line = preceding_lines[-1].strip()
            if last_line.startswith(open_tag) and last_line.endswith(close_tag):
                file_name = last_line[len(open_tag):-len(close_tag)].strip()
            elif last_line.startswith(alt_prefix):
                file_name = last_line[len(alt_prefix):].strip()

        # The rest of the opening line is the block type, unless the block is
        # closed on that very same line (inline block): then it is all content.
        header_start = fence_pos + len(fence)
        newline_pos = text.find('\n', header_start)
        code_type = ''
        content_start = header_start
        if newline_pos != -1 and fence not in text[header_start:newline_pos]:
            code_type = text[header_start:newline_pos].strip()
            content_start = newline_pos + 1

        # The first fence after the content start closes the block; without one
        # the block is incomplete and runs to the end of the text.
        close_pos = text.find(fence, content_start)
        is_complete = close_pos != -1
        content_end = close_pos if is_complete else len(text)

        codes.append({
            'index': len(codes),
            'file_name': file_name,
            'content': text[content_start:content_end].strip(),
            'type': code_type,
            'is_complete': is_complete
        })
        cursor = content_end + len(fence) if is_complete else len(text)

    if return_remaining_text:
        text_parts.append(text[cursor:].strip())
        return codes, '\n'.join(filter(None, text_parts))
    return codes