Update app.py

app.py CHANGED
@@ -329,9 -329,9 @@ def prediction_input_form(features, default_values=None):
 with st.sidebar:
     st.title("🔮 DataInsight Pro")
     app_mode = st.selectbox(
-
-
-
+        "Navigation",
+        ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions", "PDF Analysis"],
+        format_func=lambda x: f"📌 {x}"
     )
     st.markdown("---")
     st.markdown("Created by Calvin Allen-Crawford")
@@ -1571,3 +1572,220 @@ elif app_mode == "Predictions":
 
     except Exception as e:
         st.error(f"Prediction failed: {str(e)}")
+
+
+
+elif app_mode == "PDF Analysis":
+    st.title("📄 Advanced PDF Analyzer")
+
+    # PDF upload with drag & drop zone
+    with st.container(border=True):
+        uploaded_pdfs = st.file_uploader("Drag & Drop PDF Files",
+                                         type="pdf",
+                                         accept_multiple_files=True,
+                                         help="Upload multiple PDF documents for analysis")
+
+    if uploaded_pdfs:
+        # Enhanced processing options
+        with st.expander("⚙️ Analysis Configuration", expanded=True):
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.subheader("Text Options")
+                extract_mode = st.radio("Extraction Mode", ["Full Text", "Key Sections"])
+                ocr_enabled = st.checkbox("Enable OCR (for scanned PDFs)", False)
+                chunk_size = st.slider("Chunk Size (characters)", 500, 5000, 2000)
+
+            with col2:
+                st.subheader("NLP Features")
+                ner_analysis = st.checkbox("Named Entity Recognition", True)
+                ner_types = st.multiselect("Entity Types to Show",
+                                           ["PERSON", "ORG", "GPE", "DATE", "MONEY"],
+                                           default=["PERSON", "ORG"])
+                summary_length = st.select_slider("Summary Length",
+                                                  options=["Short", "Medium", "Long"],
+                                                  value="Medium")
+
+            with col3:
+                st.subheader("Advanced")
+                create_embeddings = st.checkbox("Generate Document Embeddings")
+                semantic_search = st.checkbox("Enable Semantic Search")
+                show_metadata = st.checkbox("Show Document Metadata", True)
+
+        # Security notice
+        st.info("🔒 Documents are processed in memory and never stored permanently")
+
+        if st.button("🚀 Start Analysis", type="primary"):
+            results = []
+            with st.spinner("Analyzing documents..."):
+                for pdf in stqdm(uploaded_pdfs):  # stqdm: Streamlit progress bar; assumes `from stqdm import stqdm` at the top of app.py
+                    try:
+                        # PDF processing with error handling
+                        pdf_text = extract_text_from_pdf(pdf, ocr_enabled)
+
+                        # Handle large documents with chunking
+                        chunks = [pdf_text[i:i + chunk_size]
+                                  for i in range(0, len(pdf_text), chunk_size)]
+
+                        doc_data = {
+                            "filename": pdf.name,
+                            "metadata": extract_metadata(pdf),
+                            "chunks": chunks,
+                            "content": pdf_text,
+                            "entities": pd.DataFrame(),  # empty frame keeps the .empty checks below valid when NER is off
+                            "summary": "",
+                            "embeddings": None
+                        }
+
+                        # Named Entity Recognition with filtering
+                        if ner_analysis:
+                            entities = perform_ner(pdf_text).query("Type in @ner_types")
+                            doc_data["entities"] = entities
+
+                            # Generate entity visualization
+                            doc_data["entity_viz"] = visualize_entities(pdf_text)
+
+                        # Adaptive summarization
+                        if len(pdf_text) > 1000:
+                            doc_data["summary"] = summarize_text(
+                                pdf_text,
+                                summary_length
+                            )
+                        else:
+                            doc_data["summary"] = "Text too short for summarization"
+
+                        # Generate embeddings if enabled
+                        if create_embeddings:
+                            doc_data["embeddings"] = generate_embeddings(pdf_text)
+
+                        results.append(doc_data)
+                    except Exception as e:
+                        st.error(f"Failed to process {pdf.name}: {str(e)}")
+
+            # Display results in an interactive dashboard
+            st.subheader("Analysis Dashboard")
+            tab1, tab2, tab3 = st.tabs(["Documents", "Entity Explorer", "Semantic Search"])
+
+            with tab1:
+                for doc in results:
+                    with st.expander(f"📄 {doc['filename']}", expanded=False):
+                        col1, col2 = st.columns([2, 1])
+
+                        with col1:
+                            st.subheader("Document Overview")
+
+                            if show_metadata:
+                                st.markdown("**Metadata**")
+                                st.json(doc["metadata"])
+
+                            st.markdown("**Key Summary**")
+                            st.write(doc["summary"])
+
+                            st.markdown("**Text Preview**")
+                            st.text(doc["content"][:2000] + "...")
+
+                        with col2:
+                            st.markdown("**Entity Analysis**")
+                            if not doc["entities"].empty:
+                                # Entity frequency chart
+                                fig = px.bar(doc["entities"],
+                                             x="Count", y="Entity",
+                                             color="Type", orientation='h')
+                                st.plotly_chart(fig, use_container_width=True)
+
+                                # Interactive entity selector
+                                selected_entity = st.selectbox(
+                                    "Explore Entity Context",
+                                    doc["entities"]["Entity"].unique()
+                                )
+                                entity_context = get_entity_context(
+                                    doc["content"], selected_entity)
+                                st.write(f"**{selected_entity} Context:**")
+                                st.caption(entity_context)
+
+                            # Embedding download
+                            if create_embeddings:
+                                st.download_button(
+                                    label="⬇️ Download Embeddings",
+                                    data=pd.Series(doc["embeddings"]).to_csv(),
+                                    file_name=f"{doc['filename']}_embeddings.csv"  # pdf.name here would be a stale loop variable
+                                )
+
+            with tab2:
+                st.subheader("Entity Network Analysis")
+                if results:
+                    all_entities = pd.concat([doc["entities"] for doc in results])
+                    create_entity_network(all_entities)
+
+            with tab3:
+                st.subheader("Semantic Search")
+                if semantic_search:  # `with tab3 if semantic_search else tab3` always chose tab3; gate the feature instead
+                    search_query = st.text_input("Enter semantic search query")
+                    if search_query:
+                        ranked = perform_semantic_search(search_query, results)  # don't shadow `results`
+                        st.write("Most relevant documents:")
+                        for doc in ranked[:3]:
+                            st.write(f"📄 {doc['filename']} - Score: {doc['similarity']:.2f}")
+
+# Enhanced helper functions
+def extract_text_from_pdf(pdf_file, use_ocr=False):
+    """Extract text, falling back to PyMuPDF when pdfplumber finds little text"""
+    try:
+        import pdfplumber
+        with pdfplumber.open(pdf_file) as pdf:
+            # extract_text() can return None for image-only pages
+            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
+
+        if use_ocr or len(text) < 50:  # fallback extraction path
+            import fitz  # PyMuPDF
+            pdf_file.seek(0)  # rewind: pdfplumber has already consumed the stream
+            doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+            text = ""
+            for page in doc:
+                text += page.get_text("text")
+            if len(text) < 50:
+                raise ValueError("Likely scanned document - enable OCR")
+        return text
+    except Exception as e:
+        raise RuntimeError(f"Text extraction failed: {str(e)}")
+
+def visualize_entities(text):
+    """Create interactive entity visualization"""
+    import spacy
+    from spacy import displacy
+    nlp = spacy.load("en_core_web_sm")
+    doc = nlp(text)
+    html = displacy.render(doc, style="ent", page=True)
+    return html
+
+def generate_embeddings(text):
+    """Generate document embeddings"""
+    from sentence_transformers import SentenceTransformer
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    return model.encode(text).tolist()
+
+def extract_metadata(pdf_file):
+    """Extract PDF metadata"""
+    import fitz
+    pdf_file.seek(0)  # rewind: text extraction may have consumed the stream
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    return {
+        "author": doc.metadata.get("author"),
+        "title": doc.metadata.get("title"),
+        "pages": len(doc),
+        "created": doc.metadata.get("creationDate"),
+        "modified": doc.metadata.get("modDate")
+    }
+
+def perform_semantic_search(query, docs):
+    """Semantic search using embeddings"""
+    from sentence_transformers import SentenceTransformer, util
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    query_embedding = model.encode(query)
+
+    for doc in docs:
+        if doc["embeddings"] is None:  # "Generate Document Embeddings" was left unchecked
+            doc["similarity"] = 0.0
+        else:
+            doc["similarity"] = float(util.cos_sim(query_embedding, doc["embeddings"]).mean())
+
+    return sorted(docs, key=lambda x: x["similarity"], reverse=True)