fixed bugs

2024-12-19 04:37:54 +00:00 · 2023-10-26 00:55:59 +02:00 · 2023-10-26 00:55:59 +02:00 · 239955db8b
commit 239955db8b
parent 688d70a221
2 changed files with 30 additions and 127 deletions
--- a/lollms/personality.py
+++ b/lollms/personality.py
@@ -1493,20 +1493,35 @@ class APScript(StateMachine):
        ASCIIColors.yellow(prompt)
        ASCIIColors.red(" *-*-*-*-*-*-*-*")        

-    def fast_gen(self, prompt, max_generation_size, placeholders={}, debug=False):
+    def fast_gen(self, prompt: str, max_generation_size: int, placeholders: dict = {}, sacrifice: list = ["previous_discussion"], debug: bool = False) -> str:
        """
        Fast way to generate code
+        
+        This method takes in a prompt, maximum generation size, optional placeholders, sacrifice list, and debug flag.
+        It reshapes the context before performing text generation by adjusting and cropping the number of tokens.
+        
+        Parameters:
+        - prompt (str): The input prompt for text generation.
+        - max_generation_size (int): The maximum number of tokens to generate.
+        - placeholders (dict, optional): A dictionary of placeholders to be replaced in the prompt. Defaults to an empty dictionary.
+        - sacrifice (list, optional): A list of placeholders to sacrifice if the window is bigger than the context size minus the number of tokens to generate. Defaults to ["previous_discussion"].
+        - debug (bool, optional): Flag to enable/disable debug mode. Defaults to False.
+        
+        Returns:
+        - str: The generated text after removing special tokens ("<s>" and "</s>") and stripping any leading/trailing whitespace.
        """
        pr = PromptReshaper(prompt)
        prompt = pr.build(placeholders, 
                        self.personality.model.tokenize, 
                        self.personality.model.detokenize, 
-                    self.personality.model.config.ctx_size-max_generation_size,
-                    ["previous_discussion"]
+                        self.personality.model.config.ctx_size - max_generation_size,
+                        sacrifice
                        )
-            if self.personality.config.get("debug",False):
-                self.print_prompt("prompt",prompt)
-            return self.generate(prompt, max_generation_size).strip().replace("</s>","").replace("<s>","")
+        if debug:
+            self.print_prompt("prompt", prompt)
+            
+        return self.generate(prompt, max_generation_size).strip().replace("</s>", "").replace("<s>", "")
+
    

    #Helper method to convert outputs path to url
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -6,6 +6,7 @@ import json
 import re
 import subprocess
 import gc
+from typing import List

 class NumpyEncoderDecoder(json.JSONEncoder):
    def default(self, obj):
@@ -677,119 +678,6 @@ class TextVectorizer:
            self.save_to_json()
            
      
-class GenericDataLoader:
-    @staticmethod        
-    def read_file(file_path:Path):
-        if file_path.suffix ==".pdf":
-            return GenericDataLoader.read_pdf_file(file_path)
-        elif file_path.suffix == ".docx":
-            return GenericDataLoader.read_docx_file(file_path)
-        elif file_path.suffix == ".json":
-            return GenericDataLoader.read_json_file(file_path)
-        elif file_path.suffix == ".html":
-            return GenericDataLoader.read_html_file(file_path)
-        elif file_path.suffix == ".pptx":
-            return GenericDataLoader.read_pptx_file(file_path)
-        if file_path.suffix in [".txt", ".rtf", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat"]:
-            return GenericDataLoader.read_text_file(file_path)
-        else:
-            raise ValueError("Unknown file type")
-    def get_supported_file_types():
-        return ["pdf", "txt", "docx", "json", "html", "pptx",".txt", ".md", ".log", ".cpp", ".java", ".js", ".py", ".rb", ".sh", ".sql", ".css", ".html", ".php", ".json", ".xml", ".yaml", ".yml", ".h", ".hh", ".hpp", ".inc", ".snippet", ".snippets", ".asm", ".s", ".se", ".sym", ".ini", ".inf", ".map", ".bat", ".rtf"]    
-    @staticmethod        
-    def read_pdf_file(file_path):
-        try:
-            import PyPDF2
-            from PIL import Image, UnidentifiedImageError
-            import pytesseract
-            import pdfminer
-            from pdfminer.high_level import extract_text
-        except ImportError:
-            PackageManager.install_package("PyPDF2")
-            PackageManager.install_package("pytesseract")
-            PackageManager.install_package("pillow")
-            PackageManager.install_package("pdfminer")
-            PackageManager.install_package("pdfminer.six")
-            
-            import PyPDF2
-            from PIL import Image, UnidentifiedImageError
-            import pytesseract
-            from pdfminer.high_level import extract_text
-            
-        # Extract text from the PDF
-        text = extract_text(file_path)
-
-        # Convert to Markdown (You may need to implement custom logic based on your specific use case)
-        markdown_text = text.replace('\n', '  \n')  # Adding double spaces at the end of each line for Markdown line breaks
-        
-        return markdown_text
-
-    @staticmethod
-    def read_docx_file(file_path):
-        try:
-            from docx import Document
-        except ImportError:
-            PackageManager.install_package("python-docx")
-            from docx import Document
-        doc = Document(file_path)
-        text = ""
-        for paragraph in doc.paragraphs:
-            text += paragraph.text + "\n"
-        return text
-
-    @staticmethod
-    def read_json_file(file_path):
-        import json
-        with open(file_path, 'r') as file:
-            data = json.load(file)
-        return data
-    
-    @staticmethod
-    def read_csv_file(file_path):
-        try:
-            import csv
-        except ImportError:
-            PackageManager.install_package("csv")
-            import csv
-        with open(file_path, 'r') as file:
-            csv_reader = csv.reader(file)
-            lines = [row for row in csv_reader]
-        return lines    
-
-    @staticmethod
-    def read_html_file(file_path):
-        try:
-            from bs4 import BeautifulSoup
-        except ImportError:
-            PackageManager.install_package("beautifulsoup4")
-            from bs4 import BeautifulSoup
-        with open(file_path, 'r') as file:
-            soup = BeautifulSoup(file, 'html.parser')
-            text = soup.get_text()
-        return text
-    
-    @staticmethod
-    def read_pptx_file(file_path):
-        try:
-            from pptx import Presentation
-        except ImportError:
-            PackageManager.install_package("python-pptx")
-            from pptx import Presentation
-        prs = Presentation(file_path)
-        text = ""
-        for slide in prs.slides:
-            for shape in slide.shapes:
-                if shape.has_text_frame:
-                    for paragraph in shape.text_frame.paragraphs:
-                        for run in paragraph.runs:
-                            text += run.text
-        return text
-    
-    @staticmethod
-    def read_text_file(file_path):
-        with open(file_path, 'r', encoding='utf-8') as file:
-            content = file.read()
-        return content