Spaces:

dgutierrez
/

HF_Deploy_RAG

Sleeping

App Files Community

dgutierrez committed on Aug 26

Commit

cc17218

•

1 Parent(s): e67d4b1

added pdf

Browse files

Files changed (2) hide show

aimakerspace/text_utils.py +27 -10
app.py +34 -13

aimakerspace/text_utils.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 from typing import List
 class TextFileLoader:
@@ -11,25 +12,40 @@ class TextFileLoader:
     def load(self):
         if os.path.isdir(self.path):
             self.load_directory()
-        elif os.path.isfile(self.path) and self.path.endswith(".txt"):
-            self.load_file()
         else:
-            raise ValueError(
-                "Provided path is neither a valid directory nor a .txt file."
-            )
     def load_file(self):
         with open(self.path, "r", encoding=self.encoding) as f:
             self.documents.append(f.read())
     def load_directory(self):
         for root, _, files in os.walk(self.path):
             for file in files:
                 if file.endswith(".txt"):
-                    with open(
-                        os.path.join(root, file), "r", encoding=self.encoding
-                    ) as f:
                         self.documents.append(f.read())
     def load_documents(self):
         self.load()
@@ -52,7 +68,7 @@ class CharacterTextSplitter:
     def split(self, text: str) -> List[str]:
         chunks = []
         for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
-            chunks.append(text[i : i + self.chunk_size])
         return chunks
     def split_texts(self, texts: List[str]) -> List[str]:
@@ -63,7 +79,8 @@ class CharacterTextSplitter:
 if __name__ == "__main__":
-    loader = TextFileLoader("data/KingLear.txt")
     loader.load()
     splitter = CharacterTextSplitter()
     chunks = splitter.split_texts(loader.documents)

 import os
 from typing import List
+import fitz  # PyMuPDF
 class TextFileLoader:
     def load(self):
         if os.path.isdir(self.path):
             self.load_directory()
+        elif os.path.isfile(self.path):
+            if self.path.endswith(".txt"):
+                self.load_file()
+            elif self.path.endswith(".pdf"):
+                self.load_pdf()
+            else:
+                raise ValueError("Unsupported file type. Only .txt and .pdf files are supported.")
         else:
+            raise ValueError("Provided path is neither a valid directory nor a file.")
     def load_file(self):
         with open(self.path, "r", encoding=self.encoding) as f:
             self.documents.append(f.read())
+    def load_pdf(self):
+        with fitz.open(self.path) as doc:
+            text = ""
+            for page in doc:
+                text += page.get_text("text")
+            self.documents.append(text)
     def load_directory(self):
         for root, _, files in os.walk(self.path):
             for file in files:
+                file_path = os.path.join(root, file)
                 if file.endswith(".txt"):
+                    with open(file_path, "r", encoding=self.encoding) as f:
                         self.documents.append(f.read())
+                elif file.endswith(".pdf"):
+                    with fitz.open(file_path) as doc:
+                        text = ""
+                        for page in doc:
+                            text += page.get_text("text")
+                        self.documents.append(text)
     def load_documents(self):
         self.load()
     def split(self, text: str) -> List[str]:
         chunks = []
         for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
+            chunks.append(text[i: i + self.chunk_size])
         return chunks
     def split_texts(self, texts: List[str]) -> List[str]:
 if __name__ == "__main__":
+    # Example usage with a PDF file
+    loader = TextFileLoader("data/sample.pdf")
     loader.load()
     splitter = CharacterTextSplitter()
     chunks = splitter.split_texts(loader.documents)

app.py CHANGED Viewed

@@ -11,9 +11,10 @@ from aimakerspace.openai_utils.embedding import EmbeddingModel
 from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
 import chainlit as cl
 system_template = """\
-Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
 system_role_prompt = SystemRolePrompt(system_template)
 user_prompt_template = """\
@@ -49,18 +50,38 @@ class RetrievalAugmentedQAPipeline:
 text_splitter = CharacterTextSplitter()
 def process_text_file(file: AskFileResponse):
     import tempfile
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
-        temp_file_path = temp_file.name
-    with open(temp_file_path, "wb") as f:
-        f.write(file.content)
-    text_loader = TextFileLoader(temp_file_path)
-    documents = text_loader.load_documents()
     texts = text_splitter.split_texts(documents)
     return texts
@@ -70,10 +91,10 @@ async def on_chat_start():
     files = None
     # Wait for the user to upload a file
-    while files == None:
         files = await cl.AskFileMessage(
-            content="Please upload a Text File file to begin!",
-            accept=["text/plain"],
             max_size_mb=2,
             timeout=180,
         ).send()
@@ -85,7 +106,7 @@ async def on_chat_start():
     )
     await msg.send()
-    # load the file
     texts = process_text_file(file)
     print(f"Processing {len(texts)} text chunks")
@@ -119,4 +140,4 @@ async def main(message):
     async for stream_resp in result["response"]:
         await msg.stream_token(stream_resp)
-    await msg.send()

 from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
 import chainlit as cl
+import fitz  # PyMuPDF for PDF reading
 system_template = """\
+Use the following context to answer a user's question. If you cannot find the answer in the context, say you don't know the answer."""
 system_role_prompt = SystemRolePrompt(system_template)
 user_prompt_template = """\
 text_splitter = CharacterTextSplitter()
 def process_text_file(file: AskFileResponse):
     import tempfile
+    file_extension = os.path.splitext(file.name)[-1].lower()
+    if file_extension == ".txt":
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
+            temp_file_path = temp_file.name
+        with open(temp_file_path, "wb") as f:
+            f.write(file.content)
+        text_loader = TextFileLoader(temp_file_path)
+        documents = text_loader.load_documents()
+    elif file_extension == ".pdf":
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file_path = temp_file.name
+        with open(temp_file_path, "wb") as f:
+            f.write(file.content)
+        documents = []
+        with fitz.open(temp_file_path) as doc:
+            text = ""
+            for page in doc:
+                text += page.get_text("text")
+            documents.append(text)
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
     texts = text_splitter.split_texts(documents)
     return texts
     files = None
     # Wait for the user to upload a file
+    while files is None:
         files = await cl.AskFileMessage(
+            content="Please upload a Text File or PDF to begin!",
+            accept=["text/plain", "application/pdf"],
             max_size_mb=2,
             timeout=180,
         ).send()
     )
     await msg.send()
+    # Load the file
     texts = process_text_file(file)
     print(f"Processing {len(texts)} text chunks")
     async for stream_resp in result["response"]:
         await msg.stream_token(stream_resp)
+    await msg.send()