Spaces:

RubenAMtz
/

ChatitoRAG

Runtime error

App Files Files Community

RubenAMtz committed on Dec 3, 2023

Commit

70c2a60

1 Parent(s): 31cbd5c

pdfloader, text cleaning, vector store, context in prompt

Browse files

Files changed (3) hide show

aimakerspace/text_utils.py +60 -1
app.py +49 -3
requirements.txt +2 -1

aimakerspace/text_utils.py CHANGED Viewed

@@ -1,5 +1,9 @@
 import os
-from typing import List
 class TextFileLoader:
@@ -34,6 +38,61 @@ class TextFileLoader:
     def load_documents(self):
         self.load()
         return self.documents
 class CharacterTextSplitter:

 import os
+from typing import List, Union
+from pdfminer.high_level import extract_text
+import io
+from chainlit.types import AskFileResponse
+import re
 class TextFileLoader:
     def load_documents(self):
         self.load()
         return self.documents
+class PDFFileLoader(TextFileLoader):
+    def __init__(self, path: str, encoding: str = "utf-8", content=None, files: list[AskFileResponse] = None):
+        super().__init__(path, encoding)
+        self.content = content
+        self.files = files
+    def load(self):
+        if isinstance(self.files, List):
+            for file in self.files:
+                if file.content and file.path.endswith(".pdf"):
+                    self.content = file.content
+                    self.load_content()
+        elif os.path.isdir(self.path):
+            self.load_directory()
+        elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
+            print("loading file ...")
+            self.load_file()
+        elif self.content and self.path.endswith(".pdf"):
+            print("loading content ...")
+            self.load_content()
+        else:
+            raise ValueError(
+                "Provided path is neither a valid directory nor a .pdf file."
+            )
+    def load_content(self):
+        """Load pdf already in memory"""
+        text = extract_text(io.BytesIO(self.content))
+        text = self.clean_text(text)
+        self.documents.append(text)
+    def clean_text(self, text):
+        """Clean text by removing special characters."""
+        # remove all \n
+        text = text.replace('\n', ' ')
+        text = re.sub(' +', ' ', text)
+        # remove page number, we find it because it appears before '\x0c', use regex to find it
+        text = re.sub(r'\d+ \x0c', '\x0c', text)
+        # remove all '\x0c'
+        text = text.replace('\x0c', ' ')
+        return text
+    def load_file(self):
+        text = extract_text(pdf_file=self.path, codec=self.encoding)
+        self.documents.append(text)
+    def load_directory(self):
+        for root, _, files in os.walk(self.path):
+            for file in files:
+                if file.endswith(".pdf"):
+                    self.documents.append(
+                        extract_text(os.path.join(root, file), encoding=self.encoding)
+                    )
 class CharacterTextSplitter:

app.py CHANGED Viewed

@@ -7,6 +7,9 @@ import chainlit as cl  # importing chainlit for our app
 from chainlit.prompt import Prompt, PromptMessage  # importing prompt tools
 from chainlit.playground.providers import ChatOpenAI  # importing ChatOpenAI tools
 from dotenv import load_dotenv
 load_dotenv()
@@ -18,6 +21,15 @@ user_template = """{input}
 Think through your response step by step.
 """
 @cl.on_chat_start  # marks a function that will be executed at the start of a user session
@@ -44,26 +56,52 @@ async def start_chat():
         ).send()
     # let the user know you are processing the file(s)
     # decode the file
     # split the text into chunks
     # create a vector store
-    #
 @cl.on_message  # marks a function that should be run each time the chatbot receives a message from a user
 async def main(message: cl.Message):
     settings = cl.user_session.get("settings")
     client = AsyncOpenAI()
     print(message.content)
     prompt = Prompt(
         provider=ChatOpenAI.id,
         messages=[
@@ -77,8 +115,16 @@ async def main(message: cl.Message):
                 template=user_template,
                 formatted=user_template.format(input=message.content),
             ),
         ],
-        inputs={"input": message.content},
         settings=settings,
     )

 from chainlit.prompt import Prompt, PromptMessage  # importing prompt tools
 from chainlit.playground.providers import ChatOpenAI  # importing ChatOpenAI tools
 from dotenv import load_dotenv
+from aimakerspace.text_utils import PDFFileLoader, CharacterTextSplitter
+from aimakerspace.vectordatabase import VectorDatabase
+import asyncio
 load_dotenv()
 Think through your response step by step.
 """
+assistant_template = """Use the following context, if any, to help you
+answer the user's input, if the answer is not in the context say you don't
+know the answer.
+CONTEXT:
+===============
+{context}
+===============
+"""
 @cl.on_chat_start  # marks a function that will be executed at the start of a user session
         ).send()
     # let the user know you are processing the file(s)
+    await cl.Message(
+        content="Loading your files..."
+    ).send()
     # decode the file
+    documents = PDFFileLoader(path="", files=files).load_documents()
     # split the text into chunks
+    chunks = CharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200
+    ).split_texts(documents)
+    print(chunks[0])
     # create a vector store
+    # let the user know you are processing the document(s)
+    await cl.Message(
+        content="Creating vector store"
+    ).send()
+    vector_db = VectorDatabase()
+    vector_db = await vector_db.abuild_from_list(chunks)
+    await cl.Message(
+        content="Done"
+    ).send()
+    cl.user_session.set("vector_db", vector_db)
 @cl.on_message  # marks a function that should be run each time the chatbot receives a message from a user
 async def main(message: cl.Message):
+    vector_db = cl.user_session.get("vector_db")
     settings = cl.user_session.get("settings")
     client = AsyncOpenAI()
     print(message.content)
+    results_list = vector_db.search_by_text(query_text=message.content, k=3, return_as_text=True)
+    if results_list:
+        results_string = "\n\n".join(results_list)
+    else:
+        results_string = ""
     prompt = Prompt(
         provider=ChatOpenAI.id,
         messages=[
                 template=user_template,
                 formatted=user_template.format(input=message.content),
             ),
+            PromptMessage(
+                role="assistant",
+                template=assistant_template,
+                formatted=assistant_template.format(context=results_string)
+            )
         ],
+        inputs={
+            "input": message.content,
+            "context": results_string
+            },
         settings=settings,
     )

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ numpy==1.25.2
 pandas
 scikit-learn
 matplotlib
-plotly

 pandas
 scikit-learn
 matplotlib
+plotly
+pdfminer.six