Spaces:

vishwask
/

rag

Sleeping

vishwask committed on Apr 20, 2024

Commit

c4e61a2

verified ·

1 Parent(s): fd3b58c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from langchain.memory import ConversationBufferMemory
 from langchain.llms import HuggingFaceHub
 from langchain.memory import ConversationBufferWindowMemory
 from langchain_community.document_loaders import TextLoader
 from pathlib import Path
 import chromadb
@@ -80,7 +82,7 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
     # Processing for one document only
     # loader = PyPDFLoader(file_path)
     # pages = loader.load()
-    loaders = [PyPDFLoader(x) for x in list_file_path]
     pages = []
     for loader in loaders:
         pages.extend(loader.load())
@@ -227,16 +229,16 @@ def demo():
         vector_db = gr.State()
         qa_chain = gr.State()
         collection_name = gr.State()
-        pdf_directory = '/home/user/app/pdfs/'
         def process_pdfs():
             # List all PDF files in the directory
-            pdf_files = [os.path.join(pdf_directory, file) for file in os.listdir(pdf_directory) if file.endswith(".pdf")]
             return pdf_files
         # Create a dictionary with the necessary information
         pdf_dict = {"value": process_pdfs, "height": 100, "file_count": "multiple",
-                    "visible": False, "file_types": ["pdf"], "interactive": True,
                     "label": "Uploaded PDF documents"}
         # Create a gr.Files component with the dictionary

 from langchain.llms import HuggingFaceHub
 from langchain.memory import ConversationBufferWindowMemory
 from langchain_community.document_loaders import TextLoader
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_community.document_loaders import UnstructuredHTMLLoader
 from pathlib import Path
 import chromadb
     # Processing for one document only
     # loader = PyPDFLoader(file_path)
     # pages = loader.load()
+    loaders = [UnstructuredHTMLLoader(x) for x in list_file_path]
     pages = []
     for loader in loaders:
         pages.extend(loader.load())
         vector_db = gr.State()
         qa_chain = gr.State()
         collection_name = gr.State()
+        pdf_directory = '/home/user/app/htmls/'
         def process_pdfs():
             # List all PDF files in the directory
+            pdf_files = [os.path.join(pdf_directory, file) for file in os.listdir(pdf_directory) if file.endswith(".html")]
             return pdf_files
         # Create a dictionary with the necessary information
         pdf_dict = {"value": process_pdfs, "height": 100, "file_count": "multiple",
+                    "visible": False, "file_types": ["html"], "interactive": True,
                     "label": "Uploaded PDF documents"}
         # Create a gr.Files component with the dictionary