Spaces:
Sleeping
Sleeping
Update app.py
Browse files — corrected unstructured
app.py
CHANGED
@@ -5,7 +5,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
5 |
from langchain.vectorstores import FAISS
|
6 |
from langchain.llms import HuggingFaceHub
|
7 |
from langchain.chains import ConversationalRetrievalChain
|
8 |
-
from
|
9 |
import camelot
|
10 |
from pathlib import Path
|
11 |
|
@@ -16,10 +16,14 @@ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-b
|
|
16 |
vector_store = None
|
17 |
retriever = None
|
18 |
|
|
|
19 |
def extract_text_from_pdf(filepath):
|
20 |
# Use unstructured to read text from the PDF
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
23 |
|
24 |
def extract_tables_from_pdf(filepath):
|
25 |
# Use camelot to read tables from the PDF
|
|
|
5 |
from langchain.vectorstores import FAISS
|
6 |
from langchain.llms import HuggingFaceHub
|
7 |
from langchain.chains import ConversationalRetrievalChain
|
8 |
+
from langchain_unstructured import UnstructuredLoader
|
9 |
import camelot
|
10 |
from pathlib import Path
|
11 |
|
|
|
16 |
vector_store = None
|
17 |
retriever = None
|
18 |
|
19 |
+
|
20 |
def extract_text_from_pdf(filepath):
    """Return the text of the PDF at *filepath* as a single string.

    The file is parsed with ``UnstructuredLoader``; the text of every
    document it yields is concatenated, one per line, in the order the
    loader produces them.

    Args:
        filepath: Location of the PDF to read, passed through unchanged
            to ``UnstructuredLoader``.

    Returns:
        The newline-joined text of all loaded documents; an empty string
        if the loader yields nothing.
    """
    # Use unstructured to read text from the PDF.
    # The argument must be the ``filepath`` parameter; the previous code
    # referenced an undefined ``file_path`` and failed with NameError.
    loader = UnstructuredLoader(filepath)

    # LangChain Document objects carry their text in ``page_content``
    # (they have no ``text`` attribute). Stream straight from
    # ``lazy_load`` so no intermediate list of documents is built.
    return "\n".join(doc.page_content for doc in loader.lazy_load())
|
27 |
|
28 |
def extract_tables_from_pdf(filepath):
|
29 |
# Use camelot to read tables from the PDF
|