fixed bugs

This commit is contained in:
Saifeddine ALOUI 2023-10-26 00:55:59 +02:00
parent 688d70a221
commit 239955db8b
2 changed files with 30 additions and 127 deletions

View File

@ -1493,20 +1493,35 @@ class APScript(StateMachine):
ASCIIColors.yellow(prompt) ASCIIColors.yellow(prompt)
ASCIIColors.red(" *-*-*-*-*-*-*-*") ASCIIColors.red(" *-*-*-*-*-*-*-*")
def fast_gen(self, prompt: str, max_generation_size: int, placeholders: dict = None, sacrifice: list = None, debug: bool = False) -> str:
    """
    Reshape a prompt so it fits the model context, then generate text from it.

    The prompt is handed to a PromptReshaper together with the model's
    tokenize/detokenize callables and a token budget of
    ``ctx_size - max_generation_size``; the placeholders named in
    ``sacrifice`` are passed to the reshaper as the ones it may give up.

    Parameters:
    - prompt (str): The prompt template handed to PromptReshaper.
    - max_generation_size (int): The maximum number of tokens to generate; also subtracted from the context size to obtain the prompt budget.
    - placeholders (dict, optional): Placeholder values forwarded to PromptReshaper.build. Defaults to an empty dictionary.
    - sacrifice (list, optional): Placeholder names forwarded to PromptReshaper.build as sacrificable. Defaults to ["previous_discussion"].
    - debug (bool, optional): When True, the reshaped prompt is printed through self.print_prompt. Defaults to False.

    Returns:
    - str: The generated text, stripped of leading/trailing whitespace and then with every "</s>" and "<s>" occurrence removed.
    """
    # Fresh objects per call: a literal {} / [...] default would be shared
    # between every call that relies on the default.
    if placeholders is None:
        placeholders = {}
    if sacrifice is None:
        sacrifice = ["previous_discussion"]

    pr = PromptReshaper(prompt)
    prompt = pr.build(placeholders,
                      self.personality.model.tokenize,
                      self.personality.model.detokenize,
                      self.personality.model.config.ctx_size - max_generation_size,
                      sacrifice
                      )
    if debug:
        self.print_prompt("prompt", prompt)
    # Note: whitespace is stripped before the special tokens are removed.
    return self.generate(prompt, max_generation_size).strip().replace("</s>", "").replace("<s>", "")
#Helper method to convert outputs path to url #Helper method to convert outputs path to url

View File

@ -6,6 +6,7 @@ import json
import re import re
import subprocess import subprocess
import gc import gc
from typing import List
class NumpyEncoderDecoder(json.JSONEncoder): class NumpyEncoderDecoder(json.JSONEncoder):
def default(self, obj): def default(self, obj):
@ -677,119 +678,6 @@ class TextVectorizer:
self.save_to_json() self.save_to_json()
class GenericDataLoader:
    """
    Collection of static helpers that load the content of a file, choosing
    the reader from the file suffix.

    Optional third-party readers (pdfminer.six, python-docx, beautifulsoup4,
    python-pptx) are imported lazily; when an import fails the package is
    installed through PackageManager.install_package and imported again.
    """

    # Suffixes whose content is returned verbatim as UTF-8 text.
    # ".html" and ".json" are deliberately absent: read_file routes them to
    # their dedicated readers before this set is consulted.
    _TEXT_SUFFIXES = frozenset({
        ".txt", ".rtf", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb",
        ".sh", ".sql", ".css", ".php", ".xml", ".yaml", ".yml", ".h", ".hh",
        ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym",
        ".ini", ".inf", ".map", ".bat",
    })

    @staticmethod
    def read_file(file_path: Path):
        """
        Load a file with the reader matching its suffix.

        Parameters:
        - file_path (Path or str): Path of the file to load. The suffix is
          matched case-insensitively.

        Returns:
        - The value produced by the selected reader: text for pdf, docx,
          html, pptx and plain-text suffixes, the decoded JSON value for
          ".json".

        Raises:
        - ValueError: If the suffix is not handled.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        dedicated_readers = {
            ".pdf": GenericDataLoader.read_pdf_file,
            ".docx": GenericDataLoader.read_docx_file,
            ".json": GenericDataLoader.read_json_file,
            ".html": GenericDataLoader.read_html_file,
            ".pptx": GenericDataLoader.read_pptx_file,
        }
        reader = dedicated_readers.get(suffix)
        if reader is not None:
            return reader(file_path)
        if suffix in GenericDataLoader._TEXT_SUFFIXES:
            return GenericDataLoader.read_text_file(file_path)
        raise ValueError("Unknown file type")

    @staticmethod
    def get_supported_file_types():
        """
        Return the list of supported file types.

        The list mixes bare names ("pdf") and dotted suffixes (".txt"); it is
        returned unchanged so existing callers keep seeing the same values.
        """
        return ["pdf", "txt", "docx", "json", "html", "pptx", ".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat", ".rtf"]

    @staticmethod
    def read_pdf_file(file_path):
        """
        Extract the text of a PDF and add Markdown hard line breaks.

        Only pdfminer.six is needed: the text comes from
        pdfminer.high_level.extract_text.

        Returns:
        - str: The extracted text with two spaces inserted before every
          newline.
        """
        try:
            from pdfminer.high_level import extract_text
        except ImportError:
            PackageManager.install_package("pdfminer.six")
            from pdfminer.high_level import extract_text

        # Extract text from the PDF
        text = extract_text(file_path)
        # Two trailing spaces before a newline form a Markdown hard line break
        markdown_text = text.replace('\n', '  \n')
        return markdown_text

    @staticmethod
    def read_docx_file(file_path):
        """
        Return the text of a .docx document, each paragraph followed by a
        newline.
        """
        try:
            from docx import Document
        except ImportError:
            PackageManager.install_package("python-docx")
            from docx import Document

        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    @staticmethod
    def read_json_file(file_path):
        """
        Decode a JSON file (read as UTF-8) and return the resulting value.
        """
        import json
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data

    @staticmethod
    def read_csv_file(file_path):
        """
        Parse a CSV file (read as UTF-8) and return its rows as a list of
        lists of strings.
        """
        # csv is part of the standard library, so no install fallback is needed.
        import csv
        # newline='' lets the csv module handle line endings itself, which
        # keeps newlines embedded in quoted fields intact.
        with open(file_path, 'r', encoding='utf-8', newline='') as file:
            return list(csv.reader(file))

    @staticmethod
    def read_html_file(file_path):
        """
        Return the text content of an HTML file (read as UTF-8), as produced
        by BeautifulSoup's get_text() with the 'html.parser' backend.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            PackageManager.install_package("beautifulsoup4")
            from bs4 import BeautifulSoup

        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            text = soup.get_text()
        return text

    @staticmethod
    def read_pptx_file(file_path):
        """
        Return the text of every run of every text frame in a .pptx
        presentation, concatenated without separators.
        """
        try:
            from pptx import Presentation
        except ImportError:
            PackageManager.install_package("python-pptx")
            from pptx import Presentation

        prs = Presentation(file_path)
        return "".join(
            run.text
            for slide in prs.slides
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
            for run in paragraph.runs
        )

    @staticmethod
    def read_text_file(file_path):
        """
        Return the whole content of a file decoded as UTF-8.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        return content