mirror of https://github.com/ParisNeo/lollms.git (synced 2025-01-21 03:55:01 +00:00)
This commit is contained in:
parent 326cf577bf
commit a1899088f3
132 lollms/functions/knowledge/build_knowledge_db.py (Normal file)
@@ -0,0 +1,132 @@
from pathlib import Path
from lollms.personality import APScript
from safe_store.generic_data_loader import GenericDataLoader
from safe_store.text_vectorizer import TextVectorizer
import json
import re

def remove_indexing_from_markdown(markdown_text):
    # Regular expression matching numbered ("1. ") or hyphenated ("1- ") list markers
    # at the beginning of the line
    pattern = r'^(?:\d+\.\s+|\d+-\s+)'

    # Replace the matched pattern with an empty string
    clean_text = re.sub(pattern, '', markdown_text.strip())

    return clean_text

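# Example (illustrative) of the cleanup performed by remove_indexing_from_markdown:
#   remove_indexing_from_markdown("1. What is LoLLMs?")            -> "What is LoLLMs?"
#   remove_indexing_from_markdown("2- Describe the database format.") -> "Describe the database format."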

def find_last_file(folder_path):
    # Return the name of the last existing database_{i}.json file in folder_path
    # (assumes at least one such file already exists)
    i = 0
    while True:
        file_name = f"database_{i}.json"
        file_path = Path(folder_path) / file_name
        if not file_path.exists():
            return str(f"database_{i-1}.json")
        i += 1


def find_available_file(folder_path):
    # Return the full path of the first database_{i}.json that does not exist yet
    i = 0
    while True:
        file_name = f"database_{i}.json"
        file_path = Path(folder_path) / file_name
        if not file_path.exists():
            return str(file_path)
        i += 1

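# Example (illustrative) of the naming scheme handled by the two helpers above,
# assuming the folder already contains database_0.json and database_1.json:
#   find_last_file(folder)      -> "database_1.json"          (name only)
#   find_available_file(folder) -> "<folder>/database_2.json" (full path)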

def buildKnowledgeDB(llm:APScript, data_store:TextVectorizer):
    output_folder = llm.personality.lollms_paths.personal_outputs_path/llm.personality.name
    output_folder.mkdir(parents=True, exist_ok=True)

    # Verify that the data_folder_path exists
    data_folder_path = Path(llm.personality_config.data_folder_path)
    if not data_folder_path.exists():
        llm.warning("The specified data_folder_path does not exist.")
        return
    document_files = [v for v in data_folder_path.iterdir()]

    # Load every file and split it into chunks of 512 tokens with an overlap of 128
    llm.step_start("Loading files")
    for file_path in document_files:
        document_text = GenericDataLoader.read_file(file_path)
        data_store.add_document(file_path, document_text, chunk_size=512, overlap_size=128)
    llm.step_end("Loading files")

    # Index the vector store
    llm.step_start("Indexing files")
    data_store.index()
    llm.step_end("Indexing files")

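    # Illustrative sketch (an assumption about safe_store's internal layout, based on
    # how the chunks are consumed below): each entry maps a chunk name to a dict that
    # holds at least the raw text of that chunk.
    #   data_store.chunks = {
    #       "my_doc.txt_chunk_0": {"chunk_text": "first 512 tokens of my_doc.txt ..."},
    #       ...
    #   }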
    db_name = find_available_file(output_folder)
    output = "### Building questions:\n"
    llm.full(output)

    # Iterate over all chunks, ask the model to generate questions about each chunk,
    # and accumulate them in questions_vector
    processed_chunks = 0
    questions_vector = []
    total_chunks = len(data_store.chunks.items())
    for chunk_name, chunk in data_store.chunks.items():
        chunk_text = chunk["chunk_text"]
        processed_chunks += 1
        llm.step_start(f"Processing chunk {chunk_name}: {processed_chunks}/{total_chunks}")
        # Build the prompt text with placeholders
        prompt_text = f"{llm.config.start_header_id_template}instruction: Generate questions or tasks that delve into the specific details and information presented in the text chunks. Please do not ask questions about the form of the text, and do not mention the text itself in your questions. Make sure you format the output using Markdown with each question or task placed in a separate paragraph starting with __P__.\n{llm.config.separator_template}{llm.config.start_header_id_template}chunk {{chunk_name}}: {{chunk}}{llm.config.separator_template}{llm.config.start_header_id_template}Here are some questions and tasks to further explore the contents of the given text chunks:\n__P__"
        # Ask the AI to generate questions
        generated_text = "__P__"+llm.fast_gen(prompt_text, max_generation_size=llm.personality_config.questions_gen_size, placeholders={"chunk": chunk_text, "chunk_name":chunk_name}, debug=True)
        # Split the generated text on the __P__ delimiter and accumulate the questions
        generated_lines = generated_text.strip().split("__P__")
        generated_lines = [q.replace("__P__","") for q in generated_lines]
        # Drop empty entries introduced by the leading __P__ delimiter
        generated_lines = [q for q in generated_lines if q.strip()!=""]
        generated_lines = [remove_indexing_from_markdown(q) for q in generated_lines]
        questions_vector.extend(generated_lines)
        llm.step_end(f"Processing chunk {chunk_name}: {processed_chunks}/{total_chunks}")
        output += "\n<".join(generated_lines) + "\n"
        llm.full(output)

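    # Illustrative example of the delimiter handling above: if the model returns
    #   "What is X? __P__ Summarize Y."
    # the prepended "__P__" and the split produce ["", "What is X? ", " Summarize Y."],
    # which, after dropping empty entries and stripping list markers, yields the
    # questions "What is X?" and "Summarize Y."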

    # Save the generated questions next to the final database for future reuse
    llm.step_start("Saving questions for future use")
    with open(output_folder/f"{Path(db_name).stem}_q.json", 'w') as file:
        json.dump(questions_vector, file)
    llm.step_end("Saving questions for future use")


    # Answer each question using the most similar chunks recovered from the vector store
    output += "### Building answers:\n"
    llm.full(output)
    qna_list = []
    for index, question in enumerate(questions_vector):
        # Recover the top_k chunks that are most relevant to the current question
        docs, sorted_similarities, document_ids = data_store.recover_text(question, top_k=llm.personality_config.data_vectorization_nb_chunks)
        if llm.personality_config.use_enhanced_mode:
llm.step_start(f"Verifying RAG data_{index}")
|
||||
prompt_text = """{llm.config.start_header_id_template}chunk: {{chunk}}
|
||||
{llm.config.start_header_id_template}instruction: Is the information provided in the above chunk sufficient to answer the following question?
|
||||
Valid answers:
|
||||
- Yes
|
||||
- No
|
||||
{llm.config.start_header_id_template}question: {{question}}
|
||||
{llm.config.start_header_id_template}answer: """
|
||||
if "yes" not in prompt_text.lower():
|
||||
llm.step_end(f"Verifying RAG data_{index}", False)
|
||||
continue
|
||||
llm.step_end(f"Verifying RAG data_{index}")
|
||||

        llm.step_start(f"Asking question {index}/{len(questions_vector)}")
        prompt_text = f"""{llm.config.start_header_id_template}chunk: {{chunk}}
{llm.config.start_header_id_template}instructions{llm.config.end_header_id_template}
Interpret the textual data contained within the chunk thoroughly to answer the corresponding instruction/task presented alongside it.
If the information stored in this chunk does not suffice to provide categorically accurate answers, please answer exactly __UNSUFFICIENT_INFORMATION__.
All statements must be generated solely based on the available input data, discarding any assumptions beyond what has been explicitly stated.
Do not mention the chunks, assume you are generating training data for an AI to learn from without data.
It is crucial to maintain strict adherence to the content delineated in each instance of interaction.
Be precise and helpful.
{llm.config.start_header_id_template}question: {{question}}
{llm.config.start_header_id_template}answer: """
        # {llm.config.start_header_id_template}chunk: {{chunk}}{llm.config.separator_template}{llm.config.start_header_id_template}instruction: Please use the text chunks to answer the following question:\n{llm.config.separator_template}{llm.config.start_header_id_template}question: {{question}}\n{llm.config.separator_template}{llm.config.start_header_id_template}answer: "
        # Ask the AI to generate an answer grounded in the recovered chunks
        answer = llm.fast_gen(prompt_text, max_generation_size=llm.personality_config.answer_gen_size, placeholders={"chunk": "\nchunk: ".join(docs), "question": question})
if "UNSUFFICIENT_INFORMATION" in answer:
|
||||
continue
|
||||
qna_list.append({
|
||||
"conditionning":"Act as LoLLMs expert and answer the following questions.",
|
||||
"question":question,
|
||||
"answer":answer,
|
||||
"id":0
|
||||
})
|
||||
output += f"q:{question}\na:{answer}\n"
|
||||
llm.full(output)
|
||||
llm.step_end(f"Asking question {index}/{len(questions_vector)}")
|
||||
with open(output_folder/db_name, 'w') as file:
|
||||
json.dump(qna_list, file)
|
||||
print("Dictionary saved as JSON successfully!")
|
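
# Illustrative usage from inside a personality script (names other than
# buildKnowledgeDB are assumptions, not defined in this file):
#   from lollms.functions.knowledge.build_knowledge_db import buildKnowledgeDB
#   buildKnowledgeDB(self, my_text_vectorizer)   # self: the running APScript, my_text_vectorizer: a TextVectorizer
#
# A run produces two JSON files in the personality's output folder, for example:
#   database_0_q.json : ["question 1", "question 2", ...]
#   database_0.json   : [{"conditionning": "...", "question": "...", "answer": "...", "id": 0}, ...]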