Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -323,6 +323,64 @@ def prediction_input_form(features, default_values=None):
|
|
323 |
input_data[feature] = st.number_input(f"{feature}:", value=default_value)
|
324 |
return input_data
|
325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
# --------------------------
|
327 |
# Sidebar Navigation
|
328 |
# --------------------------
|
@@ -1723,62 +1781,4 @@ elif app_mode == "PDF Analysis":
|
|
1723 |
results = perform_semantic_search(search_query, results)
|
1724 |
st.write("Most relevant documents:")
|
1725 |
for doc in results[:3]:
|
1726 |
-
st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")
|
1727 |
-
|
1728 |
-
# Enhanced Helper Functions
|
1729 |
-
def extract_text_from_pdf(pdf_file, use_ocr=False):
    """Extract the text of a PDF, with a PyMuPDF fallback pass.

    pdfplumber is tried first. When ``use_ocr`` is true, or pdfplumber
    yields fewer than 50 characters, the document is re-read with PyMuPDF
    (``fitz``) and that text replaces the pdfplumber result. No OCR engine
    is invoked in this function; the flag only forces the PyMuPDF pass.

    Args:
        pdf_file: Path or binary file-like object accepted by
            ``pdfplumber.open``.
        use_ocr: Force the PyMuPDF pass even when pdfplumber found text.

    Returns:
        The extracted text as a single string.

    Raises:
        RuntimeError: If extraction fails for any reason, including when
            the PyMuPDF pass also finds fewer than 50 characters. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        import pdfplumber

        with pdfplumber.open(pdf_file) as pdf:
            # extract_text() may return None for a page without text;
            # substitute "" so the join cannot fail with a TypeError.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)

        if use_ocr or len(text) < 50:  # Fallback to PyMuPDF
            import fitz  # PyMuPDF

            if hasattr(pdf_file, "read"):
                # pdfplumber read from this same stream above, so rewind
                # before handing the bytes to PyMuPDF.
                if hasattr(pdf_file, "seek"):
                    pdf_file.seek(0)
                doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
            else:
                doc = fitz.open(str(pdf_file))
            try:
                text = "".join(page.get_text("text") for page in doc)
            finally:
                doc.close()
            if len(text) < 50:
                raise ValueError("Likely scanned document - enable OCR")
        return text
    except Exception as e:
        raise RuntimeError(f"Text extraction failed: {str(e)}") from e
|
1747 |
-
|
1748 |
-
def visualize_entities(text):
    """Render spaCy named-entity markup for ``text``.

    Args:
        text: Text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        The result of ``displacy.render(doc, style="ent", page=True)``.
    """
    import spacy
    from spacy import displacy

    # Loading the pipeline is expensive; keep one instance on the function
    # object so repeated calls reuse it instead of reloading from disk.
    nlp = getattr(visualize_entities, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        visualize_entities._nlp = nlp

    return displacy.render(nlp(text), style="ent", page=True)
|
1756 |
-
|
1757 |
-
def generate_embeddings(text):
    """Encode ``text`` with the ``all-MiniLM-L6-v2`` sentence-transformer.

    Args:
        text: Input passed straight to ``SentenceTransformer.encode``.

    Returns:
        ``model.encode(text)`` converted with ``.tolist()``.
    """
    from sentence_transformers import SentenceTransformer

    # Instantiating the model is slow; cache it on the function object so
    # it is built once and reused on later calls.
    model = getattr(generate_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        generate_embeddings._model = model

    return model.encode(text).tolist()
|
1762 |
-
|
1763 |
-
def extract_metadata(pdf_file):
    """Read basic metadata from a PDF using PyMuPDF.

    Args:
        pdf_file: Binary file-like object containing the PDF.

    Returns:
        A dict with the keys ``author``, ``title``, ``pages``, ``created``
        and ``modified``. ``pages`` is ``len(doc)``; the other values come
        from the document metadata and are ``None`` when absent.
    """
    import fitz

    # The stream may already have been read by an earlier extraction step;
    # rewind so read() returns the whole document rather than b"".
    if hasattr(pdf_file, "seek"):
        pdf_file.seek(0)

    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        # metadata can be None (e.g. for a document that still needs a
        # password), so fall back to an empty mapping.
        meta = doc.metadata or {}
        return {
            "author": meta.get("author"),
            "title": meta.get("title"),
            "pages": len(doc),
            "created": meta.get("creationDate"),
            "modified": meta.get("modDate"),
        }
    finally:
        doc.close()
|
1774 |
-
|
1775 |
-
def perform_semantic_search(query, docs):
    """Rank ``docs`` by embedding similarity to ``query``.

    Each dict in ``docs`` is updated in place with a ``"similarity"`` key
    holding the mean cosine similarity between the query embedding and the
    dict's ``"embeddings"`` value.

    Args:
        query: Search text, encoded with ``all-MiniLM-L6-v2``.
        docs: Iterable of dicts, each with an ``"embeddings"`` entry.

    Returns:
        A new list of the same dicts sorted by ``"similarity"``, highest
        first.
    """
    # SentenceTransformer must be imported here as well; the original only
    # imported ``util`` and relied on the class name being in scope.
    from sentence_transformers import SentenceTransformer, util

    # Cache the model on the function object to avoid reloading per query.
    model = getattr(perform_semantic_search, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        perform_semantic_search._model = model

    query_embedding = model.encode(query)

    for doc in docs:
        score = util.cos_sim(query_embedding, doc["embeddings"]).mean()
        # Store a plain float rather than a tensor so the value formats,
        # compares and serializes like an ordinary number.
        doc["similarity"] = float(score)

    return sorted(docs, key=lambda x: x["similarity"], reverse=True)
|
|
|
323 |
input_data[feature] = st.number_input(f"{feature}:", value=default_value)
|
324 |
return input_data
|
325 |
|
326 |
+
# Enhanced Helper Functions
|
327 |
+
def extract_text_from_pdf(pdf_file, use_ocr=False):
    """Extract the text of a PDF, with a PyMuPDF fallback pass.

    pdfplumber is tried first. When ``use_ocr`` is true, or pdfplumber
    yields fewer than 50 characters, the document is re-read with PyMuPDF
    (``fitz``) and that text replaces the pdfplumber result. No OCR engine
    is invoked in this function; the flag only forces the PyMuPDF pass.

    Args:
        pdf_file: Path or binary file-like object accepted by
            ``pdfplumber.open``.
        use_ocr: Force the PyMuPDF pass even when pdfplumber found text.

    Returns:
        The extracted text as a single string.

    Raises:
        RuntimeError: If extraction fails for any reason, including when
            the PyMuPDF pass also finds fewer than 50 characters. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        import pdfplumber

        with pdfplumber.open(pdf_file) as pdf:
            # extract_text() may return None for a page without text;
            # substitute "" so the join cannot fail with a TypeError.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)

        if use_ocr or len(text) < 50:  # Fallback to PyMuPDF
            import fitz  # PyMuPDF

            if hasattr(pdf_file, "read"):
                # pdfplumber read from this same stream above, so rewind
                # before handing the bytes to PyMuPDF.
                if hasattr(pdf_file, "seek"):
                    pdf_file.seek(0)
                doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
            else:
                doc = fitz.open(str(pdf_file))
            try:
                text = "".join(page.get_text("text") for page in doc)
            finally:
                doc.close()
            if len(text) < 50:
                raise ValueError("Likely scanned document - enable OCR")
        return text
    except Exception as e:
        raise RuntimeError(f"Text extraction failed: {str(e)}") from e
|
345 |
+
|
346 |
+
def visualize_entities(text):
    """Render spaCy named-entity markup for ``text``.

    Args:
        text: Text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        The result of ``displacy.render(doc, style="ent", page=True)``.
    """
    import spacy
    from spacy import displacy

    # Loading the pipeline is expensive; keep one instance on the function
    # object so repeated calls reuse it instead of reloading from disk.
    nlp = getattr(visualize_entities, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        visualize_entities._nlp = nlp

    return displacy.render(nlp(text), style="ent", page=True)
|
354 |
+
|
355 |
+
def generate_embeddings(text):
    """Encode ``text`` with the ``all-MiniLM-L6-v2`` sentence-transformer.

    Args:
        text: Input passed straight to ``SentenceTransformer.encode``.

    Returns:
        ``model.encode(text)`` converted with ``.tolist()``.
    """
    from sentence_transformers import SentenceTransformer

    # Instantiating the model is slow; cache it on the function object so
    # it is built once and reused on later calls.
    model = getattr(generate_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        generate_embeddings._model = model

    return model.encode(text).tolist()
|
360 |
+
|
361 |
+
def extract_metadata(pdf_file):
    """Read basic metadata from a PDF using PyMuPDF.

    Args:
        pdf_file: Binary file-like object containing the PDF.

    Returns:
        A dict with the keys ``author``, ``title``, ``pages``, ``created``
        and ``modified``. ``pages`` is ``len(doc)``; the other values come
        from the document metadata and are ``None`` when absent.
    """
    import fitz

    # The stream may already have been read by an earlier extraction step;
    # rewind so read() returns the whole document rather than b"".
    if hasattr(pdf_file, "seek"):
        pdf_file.seek(0)

    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        # metadata can be None (e.g. for a document that still needs a
        # password), so fall back to an empty mapping.
        meta = doc.metadata or {}
        return {
            "author": meta.get("author"),
            "title": meta.get("title"),
            "pages": len(doc),
            "created": meta.get("creationDate"),
            "modified": meta.get("modDate"),
        }
    finally:
        doc.close()
|
372 |
+
|
373 |
+
def perform_semantic_search(query, docs):
    """Rank ``docs`` by embedding similarity to ``query``.

    Each dict in ``docs`` is updated in place with a ``"similarity"`` key
    holding the mean cosine similarity between the query embedding and the
    dict's ``"embeddings"`` value.

    Args:
        query: Search text, encoded with ``all-MiniLM-L6-v2``.
        docs: Iterable of dicts, each with an ``"embeddings"`` entry.

    Returns:
        A new list of the same dicts sorted by ``"similarity"``, highest
        first.
    """
    # SentenceTransformer must be imported here as well; the original only
    # imported ``util`` and relied on the class name being in scope.
    from sentence_transformers import SentenceTransformer, util

    # Cache the model on the function object to avoid reloading per query.
    model = getattr(perform_semantic_search, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        perform_semantic_search._model = model

    query_embedding = model.encode(query)

    for doc in docs:
        score = util.cos_sim(query_embedding, doc["embeddings"]).mean()
        # Store a plain float rather than a tensor so the value formats,
        # compares and serializes like an ordinary number.
        doc["similarity"] = float(score)

    return sorted(docs, key=lambda x: x["similarity"], reverse=True)
|
383 |
+
|
384 |
# --------------------------
|
385 |
# Sidebar Navigation
|
386 |
# --------------------------
|
|
|
1781 |
results = perform_semantic_search(search_query, results)
|
1782 |
st.write("Most relevant documents:")
|
1783 |
for doc in results[:3]:
|
1784 |
+
st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|