CosmickVisions committed on
Commit
1170bf0
·
verified ·
1 Parent(s): 3ea1edc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -59
app.py CHANGED
@@ -323,6 +323,64 @@ def prediction_input_form(features, default_values=None):
323
  input_data[feature] = st.number_input(f"{feature}:", value=default_value)
324
  return input_data
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  # --------------------------
327
  # Sidebar Navigation
328
  # --------------------------
@@ -1723,62 +1781,4 @@ elif app_mode == "PDF Analysis":
1723
  results = perform_semantic_search(search_query, results)
1724
  st.write("Most relevant documents:")
1725
  for doc in results[:3]:
1726
- st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")
1727
-
1728
- # Enhanced Helper Functions
1729
- def extract_text_from_pdf(pdf_file, use_ocr=False):
1730
- """Extract text with OCR support"""
1731
- try:
1732
- import pdfplumber
1733
- with pdfplumber.open(pdf_file) as pdf:
1734
- text = "\n".join([page.extract_text() for page in pdf.pages])
1735
-
1736
- if use_ocr or len(text) < 50: # Fallback to OCR
1737
- import fitz # PyMuPDF
1738
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
1739
- text = ""
1740
- for page in doc:
1741
- text += page.get_text("text")
1742
- if len(text) < 50:
1743
- raise ValueError("Likely scanned document - enable OCR")
1744
- return text
1745
- except Exception as e:
1746
- raise RuntimeError(f"Text extraction failed: {str(e)}")
1747
-
1748
- def visualize_entities(text):
1749
- """Create interactive entity visualization"""
1750
- import spacy
1751
- from spacy import displacy
1752
- nlp = spacy.load("en_core_web_sm")
1753
- doc = nlp(text)
1754
- html = displacy.render(doc, style="ent", page=True)
1755
- return html
1756
-
1757
- def generate_embeddings(text):
1758
- """Generate document embeddings"""
1759
- from sentence_transformers import SentenceTransformer
1760
- model = SentenceTransformer('all-MiniLM-L6-v2')
1761
- return model.encode(text).tolist()
1762
-
1763
- def extract_metadata(pdf_file):
1764
- """Extract PDF metadata"""
1765
- import fitz
1766
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
1767
- return {
1768
- "author": doc.metadata.get("author"),
1769
- "title": doc.metadata.get("title"),
1770
- "pages": len(doc),
1771
- "created": doc.metadata.get("creationDate"),
1772
- "modified": doc.metadata.get("modDate")
1773
- }
1774
-
1775
- def perform_semantic_search(query, docs):
1776
- """Semantic search using embeddings"""
1777
- from sentence_transformers import util
1778
- model = SentenceTransformer('all-MiniLM-L6-v2')
1779
- query_embedding = model.encode(query)
1780
-
1781
- for doc in docs:
1782
- doc["similarity"] = util.cos_sim(query_embedding, doc["embeddings"]).mean()
1783
-
1784
- return sorted(docs, key=lambda x: x["similarity"], reverse=True)
 
323
  input_data[feature] = st.number_input(f"{feature}:", value=default_value)
324
  return input_data
325
 
326
+ # Enhanced Helper Functions
327
+ def extract_text_from_pdf(pdf_file, use_ocr=False):
328
+ """Extract text with OCR support"""
329
+ try:
330
+ import pdfplumber
331
+ with pdfplumber.open(pdf_file) as pdf:
332
+ text = "\n".join([page.extract_text() for page in pdf.pages])
333
+
334
+ if use_ocr or len(text) < 50: # Fallback to OCR
335
+ import fitz # PyMuPDF
336
+ doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
337
+ text = ""
338
+ for page in doc:
339
+ text += page.get_text("text")
340
+ if len(text) < 50:
341
+ raise ValueError("Likely scanned document - enable OCR")
342
+ return text
343
+ except Exception as e:
344
+ raise RuntimeError(f"Text extraction failed: {str(e)}")
345
+
346
+ def visualize_entities(text):
347
+ """Create interactive entity visualization"""
348
+ import spacy
349
+ from spacy import displacy
350
+ nlp = spacy.load("en_core_web_sm")
351
+ doc = nlp(text)
352
+ html = displacy.render(doc, style="ent", page=True)
353
+ return html
354
+
355
+ def generate_embeddings(text):
356
+ """Generate document embeddings"""
357
+ from sentence_transformers import SentenceTransformer
358
+ model = SentenceTransformer('all-MiniLM-L6-v2')
359
+ return model.encode(text).tolist()
360
+
361
+ def extract_metadata(pdf_file):
362
+ """Extract PDF metadata"""
363
+ import fitz
364
+ doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
365
+ return {
366
+ "author": doc.metadata.get("author"),
367
+ "title": doc.metadata.get("title"),
368
+ "pages": len(doc),
369
+ "created": doc.metadata.get("creationDate"),
370
+ "modified": doc.metadata.get("modDate")
371
+ }
372
+
373
+ def perform_semantic_search(query, docs):
374
+ """Semantic search using embeddings"""
375
+ from sentence_transformers import util
376
+ model = SentenceTransformer('all-MiniLM-L6-v2')
377
+ query_embedding = model.encode(query)
378
+
379
+ for doc in docs:
380
+ doc["similarity"] = util.cos_sim(query_embedding, doc["embeddings"]).mean()
381
+
382
+ return sorted(docs, key=lambda x: x["similarity"], reverse=True)
383
+
384
  # --------------------------
385
  # Sidebar Navigation
386
  # --------------------------
 
1781
  results = perform_semantic_search(search_query, results)
1782
  st.write("Most relevant documents:")
1783
  for doc in results[:3]:
1784
+ st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")