CosmickVisions committed
Commit bef6efa · verified · 1 Parent(s): 022a14a

Update app.py

Files changed (1):
  1. app.py +214 -3
app.py CHANGED
@@ -329,9 +329,9 @@ def prediction_input_form(features, default_values=None):
     with st.sidebar:
         st.title("🔮 DataInsight Pro")
         app_mode = st.selectbox(
-            "Navigation",
-            ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions"],
-            format_func=lambda x: f"📌 {x}"
+            "Navigation",
+            ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions", "PDF Analysis"],
+            format_func=lambda x: f"📌 {x}"
         )
         st.markdown("---")
         st.markdown("Created by Calvin Allen-Crawford")
@@ -1571,3 +1571,213 @@ elif app_mode == "Predictions":
 
         except Exception as e:
             st.error(f"Prediction failed: {str(e)}")
+
+
+
+elif app_mode == "PDF Analysis":
+    st.title("📄 Advanced PDF Analyzer")
+
+    # PDF Upload with drag & drop zone
+    with st.container(border=True):
+        uploaded_pdfs = st.file_uploader("Drag & Drop PDF Files",
+                                         type="pdf",
+                                         accept_multiple_files=True,
+                                         help="Upload multiple PDF documents for analysis")
+
+    if uploaded_pdfs:
+        # Enhanced processing options
+        with st.expander("⚙️ Analysis Configuration", expanded=True):
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.subheader("Text Options")
+                extract_mode = st.radio("Extraction Mode", ["Full Text", "Key Sections"])
+                ocr_enabled = st.checkbox("Enable OCR (for scanned PDFs)", False)
+                chunk_size = st.slider("Chunk Size (characters)", 500, 5000, 2000)
+
+            with col2:
+                st.subheader("NLP Features")
+                ner_analysis = st.checkbox("Named Entity Recognition", True)
+                ner_types = st.multiselect("Entity Types to Show",
+                                           ["PERSON", "ORG", "GPE", "DATE", "MONEY"],
+                                           default=["PERSON", "ORG"])
+                summary_length = st.select_slider("Summary Length",
+                                                  options=["Short", "Medium", "Long"],
+                                                  value="Medium")
+
+            with col3:
+                st.subheader("Advanced")
+                create_embeddings = st.checkbox("Generate Document Embeddings")
+                semantic_search = st.checkbox("Enable Semantic Search")
+                show_metadata = st.checkbox("Show Document Metadata", True)
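+
+        # Note: extract_mode is collected above but not consumed anywhere below,
+        # so "Key Sections" currently behaves the same as "Full Text".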
+
+        # Security notice
+        st.info("🔒 Documents are processed in memory and never stored permanently")
+
+        if st.button("🚀 Start Analysis", type="primary"):
+            results = []
+            # "with A and B" only entered the stqdm context, never the spinner;
+            # stack both context managers instead
+            with st.spinner("Analyzing documents..."), stqdm(uploaded_pdfs) as pbar:
+                for pdf in pbar:
+                    try:
+                        # PDF Processing with error handling
+                        pdf_text = extract_text_from_pdf(pdf, ocr_enabled)
+
+                        # Handle large documents with chunking
+                        chunks = [pdf_text[i:i + chunk_size]
+                                  for i in range(0, len(pdf_text), chunk_size)]
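+                        # NB: fixed-size character chunks can cut sentences and entities
+                        # at the boundaries; a small overlap is a common mitigation, e.g.
+                        # (sketch, not part of this commit):
+                        #   step = max(1, chunk_size - 200)
+                        #   chunks = [pdf_text[i:i + chunk_size]
+                        #             for i in range(0, len(pdf_text), step)]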
+
+                        doc_data = {
+                            "filename": pdf.name,
+                            "metadata": extract_metadata(pdf),
+                            "chunks": chunks,
+                            "content": pdf_text,
+                            # empty frame (not a list) so the .empty checks below
+                            # also work when NER is disabled
+                            "entities": pd.DataFrame(),
+                            "summary": "",
+                            "embeddings": None
+                        }
+
+                        # Named Entity Recognition with filtering; assumes perform_ner
+                        # (defined elsewhere in app.py) returns a DataFrame with
+                        # "Entity", "Type" and "Count" columns
+                        if ner_analysis:
+                            entities = perform_ner(pdf_text).query("Type in @ner_types")
+                            doc_data["entities"] = entities
+
+                            # Generate entity visualization
+                            doc_data["entity_viz"] = visualize_entities(pdf_text)
+
+                        # Adaptive summarization
+                        if len(pdf_text) > 1000:
+                            doc_data["summary"] = summarize_text(
+                                pdf_text,
+                                summary_length
+                            )
+                        else:
+                            doc_data["summary"] = "Text too short for summarization"
+
+                        # Generate embeddings if enabled
+                        if create_embeddings:
+                            doc_data["embeddings"] = generate_embeddings(pdf_text)
+
+                        results.append(doc_data)
+                    except Exception as e:
+                        st.error(f"Failed to process {pdf.name}: {str(e)}")
+
+            # Display Results in Interactive Dashboard
+            st.subheader("Analysis Dashboard")
+            tab1, tab2, tab3 = st.tabs(["Documents", "Entity Explorer", "Semantic Search"])
+
+            with tab1:
+                for doc in results:
+                    with st.expander(f"📑 {doc['filename']}", expanded=False):
+                        col1, col2 = st.columns([2, 1])
+
+                        with col1:
+                            st.subheader("Document Overview")
+
+                            if show_metadata:
+                                st.markdown("**Metadata**")
+                                st.json(doc["metadata"])
+
+                            st.markdown("**Key Summary**")
+                            st.write(doc["summary"])
+
+                            st.markdown("**Text Preview**")
+                            st.text(doc["content"][:2000] + "...")
+
+                        with col2:
+                            st.markdown("**Entity Analysis**")
+                            if not doc["entities"].empty:
+                                # Entity frequency chart
+                                fig = px.bar(doc["entities"],
+                                             x="Count", y="Entity",
+                                             color="Type", orientation='h')
+                                st.plotly_chart(fig, use_container_width=True)
+
+                                # Interactive entity selector (unique key per doc
+                                # avoids duplicate-widget IDs inside this loop)
+                                selected_entity = st.selectbox(
+                                    "Explore Entity Context",
+                                    doc["entities"]["Entity"].unique(),
+                                    key=f"entity_{doc['filename']}"
+                                )
+                                entity_context = get_entity_context(
+                                    doc["content"], selected_entity)
+                                st.write(f"**{selected_entity} Context:**")
+                                st.caption(entity_context)
+
+                            # Embedding download (pdf.name here was a stale loop
+                            # variable; use the stored filename, plus a unique key)
+                            if create_embeddings:
+                                st.download_button(
+                                    label="⬇️ Download Embeddings",
+                                    data=pd.Series(doc["embeddings"]).to_csv(),
+                                    file_name=f"{doc['filename']}_embeddings.csv",
+                                    key=f"emb_{doc['filename']}"
+                                )
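+
+                            # Caveat: these widgets live inside the st.button branch, so
+                            # any interaction reruns the script with the button returning
+                            # False and the dashboard disappears unless the results are
+                            # kept in st.session_state (not done in this commit).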
+
+            with tab2:
+                st.subheader("Entity Network Analysis")
+                if results:
+                    all_entities = pd.concat([doc["entities"] for doc in results])
+                    create_entity_network(all_entities)
+
+            # "with tab3 if semantic_search else tab3" selected tab3 either way;
+            # gate the feature on the checkbox inside the tab instead
+            with tab3:
+                st.subheader("Semantic Search")
+                if semantic_search:
+                    search_query = st.text_input("Enter semantic search query")
+                    if search_query:
+                        results = perform_semantic_search(search_query, results)
+                        st.write("Most relevant documents:")
+                        for doc in results[:3]:
+                            st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")
+
+# Enhanced Helper Functions
+def extract_text_from_pdf(pdf_file, use_ocr=False):
+    """Extract text, falling back to PyMuPDF when pdfplumber finds little or none"""
+    try:
+        import pdfplumber
+        with pdfplumber.open(pdf_file) as pdf:
+            # extract_text() returns None for pages without a text layer
+            text = "\n".join([(page.extract_text() or "") for page in pdf.pages])
+
+        if use_ocr or len(text) < 50:  # fall back to PyMuPDF (a second extractor, not true OCR)
+            import fitz  # PyMuPDF
+            pdf_file.seek(0)  # rewind: pdfplumber already consumed the stream
+            doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+            text = ""
+            for page in doc:
+                text += page.get_text("text")
+            if len(text) < 50:
+                raise ValueError("Likely scanned document - enable OCR")
+        return text
+    except Exception as e:
+        raise RuntimeError(f"Text extraction failed: {str(e)}")
+
+def visualize_entities(text):
+    """Create interactive entity visualization"""
+    import spacy
+    from spacy import displacy
+    nlp = spacy.load("en_core_web_sm")
+    doc = nlp(text)
+    html = displacy.render(doc, style="ent", page=True)
+    return html
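+
+# Note: the displacy HTML stored in doc_data["entity_viz"] above is never
+# rendered by the dashboard; st.components.v1.html(html, scrolling=True)
+# would display it inside tab1.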
+
+def generate_embeddings(text):
+    """Generate document embeddings"""
+    from sentence_transformers import SentenceTransformer
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    return model.encode(text).tolist()
+
+def extract_metadata(pdf_file):
+    """Extract PDF metadata"""
+    import fitz
+    pdf_file.seek(0)  # rewind: extract_text_from_pdf may have read the stream already
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    return {
+        "author": doc.metadata.get("author"),
+        "title": doc.metadata.get("title"),
+        "pages": len(doc),
+        "created": doc.metadata.get("creationDate"),
+        "modified": doc.metadata.get("modDate")
+    }
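+
+# Note: PyMuPDF returns creationDate/modDate as raw PDF date strings
+# (e.g. "D:20240101120000Z"), not parsed datetimes.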
+
+def perform_semantic_search(query, docs):
+    """Semantic search using embeddings"""
+    # SentenceTransformer was used below but never imported here
+    from sentence_transformers import SentenceTransformer, util
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    query_embedding = model.encode(query)
+
+    for doc in docs:
+        if doc["embeddings"] is None:
+            # documents analyzed without "Generate Document Embeddings" enabled
+            doc["similarity"] = 0.0
+        else:
+            doc["similarity"] = float(util.cos_sim(query_embedding, doc["embeddings"]).mean())
+
+    return sorted(docs, key=lambda x: x["similarity"], reverse=True)
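+
+# NB: Streamlit executes app.py top to bottom on each rerun, so these helpers
+# are defined only after the "PDF Analysis" branch above has already tried to
+# call them; defining them before the navigation block (or importing them from
+# a separate module) avoids a NameError on first use.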