fixed bugs

2024-12-19 04:37:54 +00:00 · 2023-10-26 00:55:59 +02:00 · 2023-10-26 00:55:59 +02:00 · 239955db8b
commit 239955db8b
parent 688d70a221
2 changed files with 30 additions and 127 deletions
--- a/lollms/personality.py
+++ b/lollms/personality.py
@ -1493,20 +1493,35 @@ class APScript(StateMachine):
        ASCIIColors.yellow(prompt)
        ASCIIColors.red(" *-*-*-*-*-*-*-*")        
-    def fast_gen(self, prompt, max_generation_size, placeholders={}, debug=False):
+    def fast_gen(self, prompt: str, max_generation_size: int, placeholders: dict = {}, sacrifice: list = ["previous_discussion"], debug: bool = False) -> str:
-            """
+        """
-            Fast way to generate code
+        Fast way to generate code
-            """
+        
-            pr  = PromptReshaper(prompt)
+        This method takes in a prompt, maximum generation size, optional placeholders, sacrifice list, and debug flag.
-            prompt = pr.build(placeholders, 
+        It reshapes the context before performing text generation by adjusting and cropping the number of tokens.
-                    self.personality.model.tokenize, 
+        
-                    self.personality.model.detokenize, 
+        Parameters:
-                    self.personality.model.config.ctx_size-max_generation_size,
+        - prompt (str): The input prompt for text generation.
-                    ["previous_discussion"]
+        - max_generation_size (int): The maximum number of tokens to generate.
-                    )
+        - placeholders (dict, optional): A dictionary of placeholders to be replaced in the prompt. Defaults to an empty dictionary.
-            if self.personality.config.get("debug",False):
+        - sacrifice (list, optional): A list of placeholders to sacrifice if the window is bigger than the context size minus the number of tokens to generate. Defaults to ["previous_discussion"].
-                self.print_prompt("prompt",prompt)
+        - debug (bool, optional): Flag to enable/disable debug mode. Defaults to False.
-            return self.generate(prompt, max_generation_size).strip().replace("</s>","").replace("<s>","")
+        
        Returns:
        - str: The generated text after removing special tokens ("<s>" and "</s>") and stripping any leading/trailing whitespace.
        """
        pr = PromptReshaper(prompt)
        prompt = pr.build(placeholders, 
                        self.personality.model.tokenize, 
                        self.personality.model.detokenize, 
                        self.personality.model.config.ctx_size - max_generation_size,
                        sacrifice
                        )
        if debug:
            self.print_prompt("prompt", prompt)
        return self.generate(prompt, max_generation_size).strip().replace("</s>", "").replace("<s>", "")
    #Helper method to convert outputs path to url
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@ -6,6 +6,7 @@ import json
 import re
 import subprocess
 import gc
 from typing import List
 class NumpyEncoderDecoder(json.JSONEncoder):
    def default(self, obj):
@ -677,119 +678,6 @@ class TextVectorizer:
            self.save_to_json()
 class GenericDataLoader:
    @staticmethod        
    def read_file(file_path:Path):
        if file_path.suffix ==".pdf":
            return GenericDataLoader.read_pdf_file(file_path)
        elif file_path.suffix == ".docx":
            return GenericDataLoader.read_docx_file(file_path)
        elif file_path.suffix == ".json":
            return GenericDataLoader.read_json_file(file_path)
        elif file_path.suffix == ".html":
            return GenericDataLoader.read_html_file(file_path)
        elif file_path.suffix == ".pptx":
            return GenericDataLoader.read_pptx_file(file_path)
        if file_path.suffix in [".txt", ".rtf", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
            return GenericDataLoader.read_text_file(file_path)
        else:
            raise ValueError("Unknown file type")
    def get_supported_file_types():
        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat", ".rtf"]    
    @staticmethod
    def read_pdf_file(file_path):
        """
        Extract the text of a PDF file and return it with Markdown line breaks.

        Only pdfminer's `extract_text` is needed, so that is the only import
        attempted; if it is missing, the `pdfminer.six` package is installed
        through PackageManager and the import is retried.

        Parameters:
        - file_path: Path of the PDF file to read.

        Returns:
        - str: The extracted text where every newline is preceded by two spaces.
        """
        try:
            from pdfminer.high_level import extract_text
        except ImportError:
            # Install only the package that supplies the function used below.
            PackageManager.install_package("pdfminer.six")
            from pdfminer.high_level import extract_text

        # Extract text from the PDF
        text = extract_text(file_path)

        # Two trailing spaces before a newline force a line break in Markdown.
        markdown_text = text.replace('\n', '  \n')
        return markdown_text
    @staticmethod
    def read_docx_file(file_path):
        """
        Return the text of a .docx document, one paragraph per line.

        If `docx` cannot be imported, the `python-docx` package is installed
        through PackageManager and the import is retried.

        Parameters:
        - file_path: Path of the .docx file to read.

        Returns:
        - str: Every paragraph's text, each followed by a newline.
        """
        try:
            from docx import Document
        except ImportError:
            PackageManager.install_package("python-docx")
            from docx import Document

        document = Document(file_path)
        # Each paragraph, including the last one, is terminated by "\n".
        return "".join(para.text + "\n" for para in document.paragraphs)
    @staticmethod
    def read_json_file(file_path):
        import json
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data
    @staticmethod
    def read_csv_file(file_path):
        try:
            import csv
        except ImportError:
            PackageManager.install_package("csv")
            import csv
        with open(file_path, 'r') as file:
            csv_reader = csv.reader(file)
            lines = [row for row in csv_reader]
        return lines    
    @staticmethod
    def read_html_file(file_path):
        """
        Return the visible text of an HTML file.

        If `bs4` cannot be imported, the `beautifulsoup4` package is installed
        through PackageManager and the import is retried.

        Parameters:
        - file_path: Path of the HTML file to read.

        Returns:
        - str: The text obtained from `BeautifulSoup.get_text()`.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            PackageManager.install_package("beautifulsoup4")
            from bs4 import BeautifulSoup

        # Hand BeautifulSoup the raw bytes so it detects the document's
        # encoding itself instead of relying on the platform default.
        with open(file_path, 'rb') as file:
            soup = BeautifulSoup(file, 'html.parser')
        return soup.get_text()
    @staticmethod
    def read_pptx_file(file_path):
        """
        Return the concatenated text of every text run in a .pptx presentation.

        If `pptx` cannot be imported, the `python-pptx` package is installed
        through PackageManager and the import is retried.

        Parameters:
        - file_path: Path of the .pptx file to read.

        Returns:
        - str: The text of all runs, joined with no separator, in slide,
          shape, paragraph and run order. Shapes without a text frame are skipped.
        """
        try:
            from pptx import Presentation
        except ImportError:
            PackageManager.install_package("python-pptx")
            from pptx import Presentation

        presentation = Presentation(file_path)
        return "".join(
            run.text
            for slide in presentation.slides
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
            for run in paragraph.runs
        )
    @staticmethod
    def read_text_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        return content