Update app.py

app.py CHANGED
@@ -329,9 -329,9 @@ def prediction_input_form(features, default_values=None):
 with st.sidebar:
     st.title("🔮 DataInsight Pro")
     app_mode = st.selectbox(
-
-
-
+        "Navigation",
+        ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions", "PDF Analysis"],
+        format_func=lambda x: f"📌 {x}"
     )
     st.markdown("---")
     st.markdown("Created by Calvin Allen-Crawford")
@@ -1571,3 +1572,220 @@ elif app_mode == "Predictions":
 
     except Exception as e:
         st.error(f"Prediction failed: {str(e)}")
+
+
+
+elif app_mode == "PDF Analysis":
+    st.title("📄 Advanced PDF Analyzer")
+
+    # PDF upload with drag & drop zone
+    with st.container(border=True):
+        uploaded_pdfs = st.file_uploader("Drag & Drop PDF Files",
+                                         type="pdf",
+                                         accept_multiple_files=True,
+                                         help="Upload multiple PDF documents for analysis")
+
+    if uploaded_pdfs:
+        # Enhanced processing options
+        with st.expander("⚙️ Analysis Configuration", expanded=True):
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.subheader("Text Options")
+                extract_mode = st.radio("Extraction Mode", ["Full Text", "Key Sections"])
+                ocr_enabled = st.checkbox("Enable OCR (for scanned PDFs)", False)
+                chunk_size = st.slider("Chunk Size (characters)", 500, 5000, 2000)
+
+            with col2:
+                st.subheader("NLP Features")
+                ner_analysis = st.checkbox("Named Entity Recognition", True)
+                ner_types = st.multiselect("Entity Types to Show",
+                                           ["PERSON", "ORG", "GPE", "DATE", "MONEY"],
+                                           default=["PERSON", "ORG"])
+                summary_length = st.select_slider("Summary Length",
+                                                  options=["Short", "Medium", "Long"],
+                                                  value="Medium")
+
+            with col3:
+                st.subheader("Advanced")
+                create_embeddings = st.checkbox("Generate Document Embeddings")
+                semantic_search = st.checkbox("Enable Semantic Search")
+                show_metadata = st.checkbox("Show Document Metadata", True)
+
+        # Security notice
+        st.info("🔒 Documents are processed in memory and never stored permanently")
+
+        if st.button("🚀 Start Analysis", type="primary"):
+            results = []
+            with st.spinner("Analyzing documents..."):
+                for pdf in stqdm(uploaded_pdfs):  # stqdm: Streamlit progress bar; assumes `from stqdm import stqdm` at the top of app.py
+                    try:
+                        # PDF processing with error handling
+                        pdf_text = extract_text_from_pdf(pdf, ocr_enabled)
+
+                        # Handle large documents with chunking
+                        chunks = [pdf_text[i:i + chunk_size]
+                                  for i in range(0, len(pdf_text), chunk_size)]
+
+                        doc_data = {
+                            "filename": pdf.name,
+                            "metadata": extract_metadata(pdf),
+                            "chunks": chunks,
+                            "content": pdf_text,
+                            "entities": pd.DataFrame(),  # empty frame keeps the .empty checks below valid when NER is off
+                            "summary": "",
+                            "embeddings": None
+                        }
+
+                        # Named Entity Recognition with filtering
+                        if ner_analysis:
+                            entities = perform_ner(pdf_text).query("Type in @ner_types")
+                            doc_data["entities"] = entities
+
+                            # Generate entity visualization
+                            doc_data["entity_viz"] = visualize_entities(pdf_text)
+
+                        # Adaptive summarization
+                        if len(pdf_text) > 1000:
+                            doc_data["summary"] = summarize_text(
+                                pdf_text,
+                                summary_length
+                            )
+                        else:
+                            doc_data["summary"] = "Text too short for summarization"
+
+                        # Generate embeddings if enabled
+                        if create_embeddings:
+                            doc_data["embeddings"] = generate_embeddings(pdf_text)
+
+                        results.append(doc_data)
+                    except Exception as e:
+                        st.error(f"Failed to process {pdf.name}: {str(e)}")
+
+            # Display results in an interactive dashboard
+            st.subheader("Analysis Dashboard")
+            tab1, tab2, tab3 = st.tabs(["Documents", "Entity Explorer", "Semantic Search"])
+
+            with tab1:
+                for doc in results:
+                    with st.expander(f"📄 {doc['filename']}", expanded=False):
+                        col1, col2 = st.columns([2, 1])
+
+                        with col1:
+                            st.subheader("Document Overview")
+
+                            if show_metadata:
+                                st.markdown("**Metadata**")
+                                st.json(doc["metadata"])
+
+                            st.markdown("**Key Summary**")
+                            st.write(doc["summary"])
+
+                            st.markdown("**Text Preview**")
+                            st.text(doc["content"][:2000] + "...")
+
+                        with col2:
+                            st.markdown("**Entity Analysis**")
+                            if not doc["entities"].empty:
+                                # Entity frequency chart
+                                fig = px.bar(doc["entities"],
+                                             x="Count", y="Entity",
+                                             color="Type", orientation='h')
+                                st.plotly_chart(fig, use_container_width=True)
+
+                                # Interactive entity selector
+                                selected_entity = st.selectbox(
+                                    "Explore Entity Context",
+                                    doc["entities"]["Entity"].unique()
+                                )
+                                entity_context = get_entity_context(
+                                    doc["content"], selected_entity)
+                                st.write(f"**{selected_entity} Context:**")
+                                st.caption(entity_context)
+
+                            # Embedding download
+                            if create_embeddings:
+                                st.download_button(
+                                    label="⬇️ Download Embeddings",
+                                    data=pd.Series(doc["embeddings"]).to_csv(),
+                                    file_name=f"{doc['filename']}_embeddings.csv"  # pdf.name here would be a stale loop variable
+                                )
+
+            with tab2:
+                st.subheader("Entity Network Analysis")
+                if results:
+                    all_entities = pd.concat([doc["entities"] for doc in results])
+                    create_entity_network(all_entities)
+
+            with tab3:
+                st.subheader("Semantic Search")
+                if semantic_search:  # `with tab3 if semantic_search else tab3` always chose tab3; gate the feature instead
+                    search_query = st.text_input("Enter semantic search query")
+                    if search_query:
+                        ranked = perform_semantic_search(search_query, results)  # don't shadow `results`
+                        st.write("Most relevant documents:")
+                        for doc in ranked[:3]:
+                            st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")
+
+# Enhanced helper functions
+def extract_text_from_pdf(pdf_file, use_ocr=False):
+    """Extract text, falling back to PyMuPDF when pdfplumber finds little text"""
+    try:
+        import pdfplumber
+        with pdfplumber.open(pdf_file) as pdf:
+            # extract_text() can return None for image-only pages
+            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
+
+        if use_ocr or len(text) < 50:  # fallback extraction path
+            import fitz  # PyMuPDF
+            pdf_file.seek(0)  # rewind: pdfplumber has already consumed the stream
+            doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+            text = ""
+            for page in doc:
+                text += page.get_text("text")
+            if len(text) < 50:
+                raise ValueError("Likely scanned document - enable OCR")
+        return text
+    except Exception as e:
+        raise RuntimeError(f"Text extraction failed: {str(e)}")
+
+def visualize_entities(text):
+    """Create interactive entity visualization"""
+    import spacy
+    from spacy import displacy
+    nlp = spacy.load("en_core_web_sm")
+    doc = nlp(text)
+    html = displacy.render(doc, style="ent", page=True)
+    return html
+
+def generate_embeddings(text):
+    """Generate document embeddings"""
+    from sentence_transformers import SentenceTransformer
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    return model.encode(text).tolist()
+
+def extract_metadata(pdf_file):
+    """Extract PDF metadata"""
+    import fitz
+    pdf_file.seek(0)  # rewind: text extraction may have consumed the stream
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    return {
+        "author": doc.metadata.get("author"),
+        "title": doc.metadata.get("title"),
+        "pages": len(doc),
+        "created": doc.metadata.get("creationDate"),
+        "modified": doc.metadata.get("modDate")
+    }
+
+def perform_semantic_search(query, docs):
+    """Semantic search using embeddings"""
+    from sentence_transformers import SentenceTransformer, util
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    query_embedding = model.encode(query)
+
+    for doc in docs:
+        if doc["embeddings"] is None:  # "Generate Document Embeddings" was left unchecked
+            doc["similarity"] = 0.0
+        else:
+            doc["similarity"] = float(util.cos_sim(query_embedding, doc["embeddings"]).mean())
+
+    return sorted(docs, key=lambda x: x["similarity"], reverse=True)