Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -60,7 +60,9 @@ def process_pdf(pdf_file):
|
|
60 |
# read full text
|
61 |
reader = PdfReader(pdf_file.name)
|
62 |
pages = [p.extract_text() or "" for p in reader.pages]
|
63 |
-
pdf_text = "
|
|
|
|
|
64 |
|
65 |
# extract elements with images via unstructured
|
66 |
try:
|
@@ -89,12 +91,11 @@ def process_pdf(pdf_file):
|
|
89 |
# combine text chunks and image captions
|
90 |
docs = chunks + captions
|
91 |
|
92 |
-
|
93 |
vectors = embeddings.embed_documents(docs)
|
94 |
-
# FAISS.from_embeddings expects list of (text, embedding) pairs
|
95 |
pairs = list(zip(docs, vectors))
|
96 |
index = FAISS.from_embeddings(pairs)
|
97 |
-
retriever = index.as_retriever(search_kwargs={"k":2})
|
98 |
|
99 |
status = f"β
Indexed β {len(chunks)} text chunks + {len(captions)} captions"
|
100 |
return os.path.basename(pdf_file.name), status, gr.update(interactive=True)
|
|
|
60 |
# read full text
|
61 |
reader = PdfReader(pdf_file.name)
|
62 |
pages = [p.extract_text() or "" for p in reader.pages]
|
63 |
+
pdf_text = "
|
64 |
+
|
65 |
+
".join(pages)
|
66 |
|
67 |
# extract elements with images via unstructured
|
68 |
try:
|
|
|
91 |
# combine text chunks and image captions
|
92 |
docs = chunks + captions
|
93 |
|
94 |
+
# embed and index
|
95 |
vectors = embeddings.embed_documents(docs)
|
|
|
96 |
pairs = list(zip(docs, vectors))
|
97 |
index = FAISS.from_embeddings(pairs)
|
98 |
+
retriever = index.as_retriever(search_kwargs={"k": 2})
|
99 |
|
100 |
status = f"β
Indexed β {len(chunks)} text chunks + {len(captions)} captions"
|
101 |
return os.path.basename(pdf_file.name), status, gr.update(interactive=True)
|