Spaces:

raj999
/

rag_for_all

Sleeping

raj999 committed on Sep 22, 2024

Commit

af50095

verified ·

1 Parent(s): 26d62fd

Update app.py

corrected unstructured

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.llms import HuggingFaceHub
 from langchain.chains import ConversationalRetrievalChain
-from unstructured.documents import from_pdf
 import camelot
 from pathlib import Path
@@ -16,10 +16,14 @@ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-b
 vector_store = None
 retriever = None
 def extract_text_from_pdf(filepath):
     # Use unstructured to read text from the PDF
-    documents = from_pdf(filepath)
-    return "\n".join([doc.text for doc in documents])
 def extract_tables_from_pdf(filepath):
     # Use camelot to read tables from the PDF

 from langchain.vectorstores import FAISS
 from langchain.llms import HuggingFaceHub
 from langchain.chains import ConversationalRetrievalChain
+from langchain_unstructured import UnstructuredLoader
 import camelot
 from pathlib import Path
 vector_store = None
 retriever = None
 def extract_text_from_pdf(filepath):
     """Return the text of the PDF at *filepath* as one newline-joined string.

     The file is parsed with ``UnstructuredLoader``; every element the loader
     yields is a LangChain ``Document`` whose text is stored in
     ``page_content``.

     Args:
         filepath: Path of the PDF file to read.

     Returns:
         The text of all loaded documents, separated by newlines.
     """
     # Use unstructured to read text from the PDF
     # NOTE: the argument must be the ``filepath`` parameter; the previous
     # spelling ``file_path`` referred to a name this function never defines.
     loader = UnstructuredLoader(filepath)
     # Feed the lazily loaded documents straight into the join rather than
     # buffering them in an intermediate list first.
     return "\n".join(doc.page_content for doc in loader.lazy_load())
 def extract_tables_from_pdf(filepath):
     # Use camelot to read tables from the PDF