Enhanced code extraction

This commit is contained in:
Saifeddine ALOUI 2024-12-15 21:02:19 +01:00
parent f855262848
commit c224bcb35d

View File

@ -4374,138 +4374,82 @@ transition-all duration-300 ease-in-out">
return updated_content, True # Section updated successfully return updated_content, True # Section updated successfully
def extract_code_blocks(self, text: str, return_remaining_text: bool = False) -> Union[List[dict], Tuple[List[dict], str]]: def extract_code_blocks(text: str, return_remaining_text: bool = False) -> Union[List[dict], Tuple[List[dict], str]]:
""" codes = []
This function extracts code blocks from a given text and optionally returns the text without code blocks. remaining_text = text
current_index = 0
Parameters: while True:
text (str): The text from which to extract code blocks. Code blocks are identified by triple backticks (```). # Find next code block start
return_remaining_text (bool): If True, also returns the text with code blocks removed. start_pos = remaining_text.find('```')
if start_pos == -1:
break
Returns: # Check for file name before code block
Union[List[dict], Tuple[List[dict], str]]: file_name = ''
- If return_remaining_text is False: Returns only the list of code block dictionaries file_name_match = remaining_text[:start_pos].rfind('<file_name>')
- If return_remaining_text is True: Returns a tuple containing: if file_name_match != -1:
* List of code block dictionaries file_name_end = remaining_text[:start_pos].rfind('</file_name>')
* String containing the text with all code blocks removed if file_name_end != -1 and file_name_match < file_name_end:
file_name = remaining_text[file_name_match + 11:file_name_end].strip()
Each code block dictionary contains: # Get code type if specified
- 'index' (int): The index of the code block in the text code_type = ''
- 'file_name' (str): The name of the file extracted from the preceding line, if available next_newline = remaining_text.find('\n', start_pos + 3)
- 'content' (str): The content of the code block if next_newline != -1:
- 'type' (str): The type of the code block potential_type = remaining_text[start_pos + 3:next_newline].strip()
- 'is_complete' (bool): True if the block has a closing tag, False otherwise if potential_type:
""" code_type = potential_type
remaining = text start_pos = next_newline + 1
bloc_index = 0 else:
first_index = 0 start_pos += 3
indices = [] else:
text_without_blocks = text start_pos += 3
# Find all code block delimiters # Find matching end tag
while len(remaining) > 0: tag_count = 1
try: pos = start_pos
index = remaining.index("```") content_start = start_pos
indices.append(index + first_index) is_complete = False
remaining = remaining[index + 3:]
first_index += index + 3
bloc_index += 1
except Exception as ex:
if bloc_index % 2 == 1:
index = index+len(remaining)
indices.append(index)
remaining = ""
code_blocks = [] while pos < len(remaining_text):
is_start = True if remaining_text[pos:pos + 3] == '```':
tag_count -= 1
if tag_count == 0:
# Found matching end tag
content = remaining_text[content_start:pos].strip()
is_complete = True
codes.append({
'index': current_index,
'file_name': file_name,
'content': content,
'type': code_type,
'is_complete': True
})
remaining_text = remaining_text[pos + 3:]
break
elif remaining_text[pos:pos + 3] == '```':
tag_count += 1
pos += 1
# Process code blocks and build text without blocks if requested if not is_complete:
if return_remaining_text: # Handle incomplete code block
text_parts = [] content = remaining_text[content_start:].strip()
last_end = 0 codes.append({
'index': current_index,
for index, code_delimiter_position in enumerate(indices): 'file_name': file_name,
if is_start: 'content': content,
block_infos = { 'type': code_type,
'index': len(code_blocks),
'file_name': "",
'section': "",
'content': "",
'type': "",
'is_complete': False 'is_complete': False
} })
remaining_text = ''
# Store text before code block if returning remaining text current_index += 1
if return_remaining_text:
text_parts.append(text[last_end:code_delimiter_position].strip())
# Check the preceding line for file name
preceding_text = text[:code_delimiter_position].strip().splitlines()
if preceding_text:
last_line = preceding_text[-1].strip()
if last_line.startswith("<file_name>") and last_line.endswith("</file_name>"):
file_name = last_line[len("<file_name>"):-len("</file_name>")].strip()
block_infos['file_name'] = file_name
elif last_line.startswith("## filename:"):
file_name = last_line[len("## filename:"):].strip()
block_infos['file_name'] = file_name
if last_line.startswith("<section>") and last_line.endswith("</section>"):
section = last_line[len("<section>"):-len("</section>")].strip()
block_infos['section'] = section
sub_text = text[code_delimiter_position + 3:]
if len(sub_text) > 0:
try:
find_space = sub_text.index(" ")
except:
find_space = int(1e10)
try:
find_return = sub_text.index("\n")
except:
find_return = int(1e10)
next_index = min(find_return, find_space)
if '{' in sub_text[:next_index]:
next_index = 0
start_pos = next_index
if code_delimiter_position + 3 < len(text) and text[code_delimiter_position + 3] in ["\n", " ", "\t"]:
block_infos["type"] = 'language-specific'
else:
block_infos["type"] = sub_text[:next_index]
if index + 1 < len(indices):
next_pos = indices[index + 1] - code_delimiter_position
if next_pos - 3>0:
if next_pos - 3 < len(sub_text) and sub_text[next_pos - 3] == "`":
block_infos["content"] = sub_text[start_pos:next_pos - 3].strip()
block_infos["is_complete"] = True
else:
block_infos["content"] = sub_text[start_pos:next_pos].strip()
block_infos["is_complete"] = False
if return_remaining_text: if return_remaining_text:
last_end = indices[index + 1] + 3 return codes, remaining_text
else: return codes
block_infos["content"] = sub_text[start_pos:].strip()
block_infos["is_complete"] = False
if return_remaining_text:
last_end = len(text)
code_blocks.append(block_infos)
is_start = False
else:
is_start = True
if return_remaining_text:
# Add any remaining text after the last code block
if last_end < len(text):
text_parts.append(text[last_end:].strip())
# Join all non-code parts with newlines
text_without_blocks = '\n'.join(filter(None, text_parts))
return code_blocks, text_without_blocks
return code_blocks
def build_and_execute_python_code(self,context, instructions, execution_function_signature, extra_imports=""): def build_and_execute_python_code(self,context, instructions, execution_function_signature, extra_imports=""):