raj999 committed on
Commit
af50095
·
verified ·
1 Parent(s): 26d62fd

Update app.py

Browse files

corrected unstructured

Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -5,7 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain.vectorstores import FAISS
6
  from langchain.llms import HuggingFaceHub
7
  from langchain.chains import ConversationalRetrievalChain
8
- from unstructured.documents import from_pdf
9
  import camelot
10
  from pathlib import Path
11
 
@@ -16,10 +16,14 @@ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-b
16
  vector_store = None
17
  retriever = None
18
 
 
19
  def extract_text_from_pdf(filepath):
20
  # Use unstructured to read text from the PDF
21
- documents = from_pdf(filepath)
22
- return "\n".join([doc.text for doc in documents])
 
 
 
23
 
24
  def extract_tables_from_pdf(filepath):
25
  # Use camelot to read tables from the PDF
 
5
  from langchain.vectorstores import FAISS
6
  from langchain.llms import HuggingFaceHub
7
  from langchain.chains import ConversationalRetrievalChain
8
+ from langchain_unstructured import UnstructuredLoader
9
  import camelot
10
  from pathlib import Path
11
 
 
16
  vector_store = None
17
  retriever = None
18
 
19
+
20
  def extract_text_from_pdf(filepath):
21
  # Use unstructured to read text from the PDF
22
+ loader = UnstructuredLoader(file_path)
23
+ pages = []
24
+ for doc in loader.lazy_load():
25
+ pages.append(doc)
26
+ return "\n".join([page.text for page in pages])
27
 
28
  def extract_tables_from_pdf(filepath):
29
  # Use camelot to read tables from the PDF