textmetric-stramlit-1

Sleeping

App Files Community

samyak152002 committed on Nov 3, 2024

Commit

99dc100

verified ·

1 Parent(s): 40e8eb9

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -278

app.py CHANGED Viewed

@@ -1,299 +1,66 @@
 import streamlit as st
-import re
-import fitz  # PyMuPDF
-from pdfminer.high_level import extract_text
-from pdfminer.layout import LAParams
-import language_tool_python
-from typing import List, Dict, Any, Tuple
-from collections import Counter
-import json
-import traceback
-import io
 import tempfile
 import os
-import base64
-# Set JAVA_HOME environment variable
-os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
-# ------------------------------
-# Analysis Functions
-# ------------------------------
-def extract_pdf_text_by_page(file) -> List[str]:
-    """Extracts text from a PDF file, page by page, using PyMuPDF."""
-    if isinstance(file, str):
-        with fitz.open(file) as doc:
-            return [page.get_text("text") for page in doc]
-    else:
-        with fitz.open(stream=file.read(), filetype="pdf") as doc:
-            return [page.get_text("text") for page in doc]
-def extract_pdf_text(file) -> str:
-    """Extracts text from a PDF file using pdfminer."""
-    if isinstance(file, str):
-        with open(file, 'rb') as f:
-            return extract_text(f, laparams=LAParams())
-    else:
-        return extract_text(file, laparams=LAParams())
-def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
-    """Checks for the presence of required terms in the text."""
-    return {term: term.lower() in full_text.lower() for term in search_terms}
-def check_metadata(full_text: str) -> Dict[str, Any]:
-    """Check for metadata elements."""
-    return {
-        "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
-        "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
-        "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
-        "word_count": len(full_text.split())
-    }
-def check_language_issues(full_text: str) -> Dict[str, Any]:
-    """Check for language issues."""
-    try:
-        language_tool = language_tool_python.LanguageTool('en-US')
-        matches = language_tool.check(full_text)
-        issues = []
-        for match in matches:
-            issues.append({
-                "message": match.message,
-                "context": match.context,
-                "suggestions": match.replacements[:3] if match.replacements else [],
-                "category": match.category,
-                "rule_id": match.ruleId
-            })
-        return {
-            "total_issues": len(issues),
-            "issues": issues
-        }
-    except Exception as e:
-        return {
-            "total_issues": 0,
-            "issues": [],
-            "error": str(e)
-        }
-def analyze_pdf(file) -> Dict[str, Any]:
-    """Main analysis function."""
-    try:
-        # Extract text
-        full_text = extract_pdf_text(file)
-        # Perform analysis
-        results = {
-            "metadata": check_metadata(full_text),
-            "language": {
-                "issues": check_language_issues(full_text)
-            },
-            "structure": {
-                "has_abstract": bool(re.search(r'\bAbstract\b', full_text, re.IGNORECASE)),
-                "has_introduction": bool(re.search(r'\bIntroduction\b', full_text, re.IGNORECASE)),
-                "has_conclusion": bool(re.search(r'\bConclusion\b', full_text, re.IGNORECASE))
-            }
-        }
-        return results
-    except Exception as e:
-        return {"error": str(e), "traceback": traceback.format_exc()}
-# ------------------------------
-# PDF Display Functions
-# ------------------------------
 def display_pdf(pdf_bytes):
-    """Display PDF in Streamlit."""
-    base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
-    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="800" type="application/pdf"></iframe>'
-    st.markdown(pdf_display, unsafe_allow_html=True)
-def get_pdf_display_html(pdf_bytes):
-    """Generate HTML for PDF display with highlight container."""
-    base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
-    return f"""
-        <div style="position: relative; width: 100%; height: 800px;">
-            <iframe src="data:application/pdf;base64,{base64_pdf}"
-                    width="100%"
-                    height="100%"
-                    style="border: none;">
-            </iframe>
-            <div id="highlight-container"></div>
-        </div>
-    """
-# ------------------------------
-# Streamlit Interface Functions
-# ------------------------------
-def render_sidebar():
-    """Render the sidebar with analysis options."""
-    st.sidebar.title("PDF Analysis Options")
-    options = {
-        "check_language": st.sidebar.checkbox("Check Language", value=True),
-        "check_structure": st.sidebar.checkbox("Check Structure", value=True),
-        "check_metadata": st.sidebar.checkbox("Check Metadata", value=True)
-    }
-    return options
-def display_analysis_results(results: Dict[str, Any]):
-    """Display analysis results in an organized manner."""
-    st.sidebar.markdown("## Analysis Results")
-    # Display metadata results
-    if "metadata" in results:
-        with st.sidebar.expander("📋 Metadata Analysis", expanded=True):
-            metadata = results["metadata"]
-            st.markdown(f"**Word Count:** {metadata['word_count']}")
-            st.markdown(f"**Has Author List:** {'✅' if metadata['list_of_authors'] else '❌'}")
-            st.markdown(f"**Has Keywords:** {'✅' if metadata['keywords_list'] else '❌'}")
-    # Display language issues
-    if "language" in results and "issues" in results["language"]:
-        with st.sidebar.expander("🔤 Language Issues", expanded=True):
-            issues = results["language"]["issues"]
-            st.markdown(f"**Total Issues Found:** {issues['total_issues']}")
-            if issues['total_issues'] > 0:
-                for idx, issue in enumerate(issues['issues'], 1):
-                    st.markdown(f"""
-                    **Issue {idx}:**
-                    - Type: {issue['category']}
-                    - Message: {issue['message']}
-                    - Context: {issue['context']}
-                    - Suggestions: {', '.join(issue['suggestions']) if issue['suggestions'] else 'None'}
-                    ---
-                    """)
-    # Display structure analysis
-    if "structure" in results:
-        with st.sidebar.expander("🏗️ Structure Analysis", expanded=True):
-            structure = results["structure"]
-            st.markdown(f"**Has Abstract:** {'✅' if structure['has_abstract'] else '❌'}")
-            st.markdown(f"**Has Introduction:** {'✅' if structure['has_introduction'] else '❌'}")
-            st.markdown(f"**Has Conclusion:** {'✅' if structure['has_conclusion'] else '❌'}")
-# ------------------------------
-# Main Application
-# ------------------------------
 def main():
     st.set_page_config(
         page_title="PDF Analyzer",
         page_icon="📄",
         layout="wide",
-        initial_sidebar_state="expanded"
     )
-    # Main title
-    st.title("PDF Document Analyzer")
     st.markdown("""
-    Upload a PDF document to analyze its structure, language, and metadata.
-    The analysis results will appear in the sidebar, and any issues found will be highlighted in the document.
     """)
-    # Get analysis options from sidebar
-    options = render_sidebar()
-    # File uploader
-    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
     if uploaded_file is not None:
-        try:
-            # Read PDF file
-            pdf_bytes = uploaded_file.read()
-            # Create two columns for layout
-            col1, col2 = st.columns([0.7, 0.3])
-            with col1:
-                st.markdown("### Document Preview")
-                # Display PDF
-                display_pdf(pdf_bytes)
-            with col2:
-                st.markdown("### Analysis Progress")
-                # Show progress bar while analyzing
-                with st.spinner("Analyzing PDF..."):
-                    # Analyze PDF
-                    results = analyze_pdf(io.BytesIO(pdf_bytes))
-                    if "error" in results:
-                        st.error("Error during analysis:")
-                        st.code(results["error"])
-                        if "traceback" in results:
-                            with st.expander("Show error details"):
-                                st.code(results["traceback"])
-                    else:
-                        st.success("Analysis complete!")
-                        # Display summary metrics
-                        col2_1, col2_2 = st.columns(2)
-                        with col2_1:
-                            st.metric(
-                                "Language Issues",
-                                results.get("language", {}).get("issues", {}).get("total_issues", 0)
-                            )
-                        with col2_2:
-                            st.metric(
-                                "Word Count",
-                                results.get("metadata", {}).get("word_count", 0)
-                            )
-                        # Display detailed results in sidebar
-                        display_analysis_results(results)
-        except Exception as e:
-            st.error(f"An error occurred: {str(e)}")
-            st.code(traceback.format_exc())
-    else:
-        # Show instructions when no file is uploaded
-        st.markdown("""
-        ### Instructions
-        1. Use the sidebar to select which aspects of the document you want to analyze
-        2. Upload a PDF file using the file uploader above
-        3. View the analysis results in the sidebar
-        4. Issues found will be highlighted in the document preview
-        ### Features
-        - **Language Analysis**: Checks for grammar, style, and clarity issues
-        - **Structure Analysis**: Verifies the presence of key document sections
-        - **Metadata Analysis**: Examines document metadata and formatting
-        """)
-# ------------------------------
-# CSS Styles
-# ------------------------------
-def load_css():
-    """Load custom CSS styles."""
-    st.markdown("""
-        <style>
-        .highlight {
-            background-color: yellow;
-            opacity: 0.3;
-            position: absolute;
-            pointer-events: none;
-        }
-        .stButton>button {
-            width: 100%;
-        }
-        .sidebar .sidebar-content {
-            width: 100%;
-        }
-        </style>
-    """, unsafe_allow_html=True)
-# ------------------------------
-# Run Application
-# ------------------------------
 if __name__ == "__main__":
-    load_css()
     main()

+# app.py
 import streamlit as st
+import base64
+from annotations import analyze_pdf
 import tempfile
 import os
 def display_pdf(pdf_bytes):
+    """Displays the PDF in the browser using an iframe."""
+    if pdf_bytes:
+        base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
+        pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="800px" type="application/pdf"></iframe>'
+        st.markdown(pdf_display, unsafe_allow_html=True)
+    else:
+        st.info("No annotated PDF to display.")
 def main():
     st.set_page_config(
         page_title="PDF Analyzer",
         page_icon="📄",
         layout="wide",
     )
+    st.title("📄 PDF Analyzer")
     st.markdown("""
+    Upload a PDF to analyze its language, highlight errors, and view detailed error reports.
     """)
+    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
     if uploaded_file is not None:
+        with st.spinner("Analyzing PDF..."):
+            language_results, annotated_pdf = analyze_pdf(uploaded_file)
+        if "error" in language_results:
+            st.error("An error occurred during analysis:")
+            st.code(language_results["error"])
+        else:
+            st.success("Analysis complete!")
+            # Display the annotated PDF
+            st.subheader("📄 Annotated PDF")
+            display_pdf(annotated_pdf)
+            # Sidebar for error details
+            st.sidebar.header("📝 Error Details")
+            if language_results.get("total_issues", 0) > 0:
+                for idx, issue in enumerate(language_results["issues"], 1):
+                    with st.sidebar.expander(f"Issue {idx}"):
+                        st.markdown(f"**Message:** {issue['message']}")
+                        st.markdown(f"**Category:** {issue['category']}")
+                        st.markdown(f"**Suggestions:** {', '.join(issue['suggestions']) if issue['suggestions'] else 'No suggestions'}")
+                        st.markdown(f"**Sentence:** {issue['context']}")
+            else:
+                st.sidebar.success("No language issues found!")
+            # Option to download the annotated PDF
+            if annotated_pdf:
+                b64 = base64.b64encode(annotated_pdf).decode()
+                href = f'<a href="data:application/pdf;base64,{b64}" download="annotated.pdf">📥 Download Annotated PDF</a>'
+                st.markdown(href, unsafe_allow_html=True)
 if __name__ == "__main__":
     main()