import base64
import io
import json
import os
import re
import tempfile
import traceback
from collections import Counter
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import language_tool_python
import streamlit as st
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

# Set JAVA_HOME environment variable
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'


# ------------------------------
# Data Classes
# ------------------------------
@dataclass
class Highlight:
    """A rectangular region of a PDF page together with its annotation."""

    # Page index the highlight belongs to.
    page: int
    # Bounding box stored as (x0, y0, x1, y1).
    rect: Tuple[float, float, float, float]
    # Colour name used when rendering the highlight.
    color: str
    # Text attached to the highlight.
    message: str
    # Grouping label for the highlight.
    category: str


@dataclass
class AnalysisResult:
    """Container bundling highlights, messages and summary values."""

    highlights: List[Highlight]
    messages: List[Dict[str, Any]]
    summary: Dict[str, Any]


# ------------------------------
# PDF Processing Functions
# ------------------------------
def extract_pdf_text_by_page(file) -> List[str]:
    """Extract text from a PDF, page by page, using PyMuPDF.

    Args:
        file: A filesystem path (``str``), the raw PDF content (``bytes``)
            or an ``io.BytesIO`` holding the PDF content.

    Returns:
        One string per page, in page order. An empty list is returned for
        any other input type.
    """
    if isinstance(file, str):
        # A string is a path (as in extract_pdf_text); it cannot be passed
        # as ``stream=`` because PyMuPDF expects binary data there.
        doc = fitz.open(file)
    elif isinstance(file, bytes):
        doc = fitz.open(stream=file, filetype="pdf")
    elif isinstance(file, io.BytesIO):
        # getvalue() returns the whole buffer without moving the stream
        # position, so the caller can keep using the same object afterwards.
        doc = fitz.open(stream=file.getvalue(), filetype="pdf")
    else:
        return []

    # The context manager closes the document even if extraction fails.
    with doc:
        return [page.get_text("text") for page in doc]


def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using pdfminer.

    Args:
        file: A filesystem path (``str``), the raw PDF content (``bytes``)
            or an ``io.BytesIO`` holding the PDF content.

    Returns:
        The extracted text, or an empty string for any other input type.
    """
    if isinstance(file, bytes):
        # pdfminer accepts a path or a binary file object, not raw bytes.
        file = io.BytesIO(file)
    if isinstance(file, (str, io.BytesIO)):
        return extract_text(file, laparams=LAParams())
    return ""


# ... (keep all your existing analysis functions) ...
# ------------------------------
# Highlight Processing Functions
# ------------------------------

# Punctuation commonly glued to a word in extracted PDF text.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"


def _normalize_token(token: str) -> str:
    """Lower-case *token* and drop punctuation attached to its edges."""
    return token.lower().strip(_EDGE_PUNCTUATION)


def get_word_coordinates(doc: fitz.Document) -> Dict[int, List[Dict[str, Any]]]:
    """Extract word coordinates from each page of the PDF.

    Args:
        doc: An open PyMuPDF document.

    Returns:
        A mapping from page index to a list of word records. Each record has
        the keys ``"text"`` (item 4 of the word tuple), ``"rect"`` (a
        ``fitz.Rect`` built from items 0-3) and ``"origin"`` (the remaining
        items of the tuple from index 5 onwards).
    """
    return {
        page_num: [
            {
                "text": word[4],
                "rect": fitz.Rect(word[:4]),
                "origin": word[5:],
            }
            for word in page.get_text("words")
        ]
        for page_num, page in enumerate(doc)
    }


def find_text_location(text: str, word_coordinates: Dict[int, List[Dict[str, Any]]]) -> Optional[Highlight]:
    """Find the location of text in the PDF and return a Highlight object.

    The query is split on whitespace and matched against consecutive words
    of each page, ignoring case and punctuation attached to word edges. The
    first full match wins.

    Args:
        text: The phrase to look for.
        word_coordinates: Per-page word records as produced by
            ``get_word_coordinates``.

    Returns:
        A yellow ``Highlight`` whose rectangle is the union of the matched
        words' rectangles, or ``None`` when the phrase is empty or does not
        occur.
    """
    tokens = [_normalize_token(token) for token in text.split()]
    if not tokens:
        return None

    span = len(tokens)
    for page_num, words in word_coordinates.items():
        normalized = [_normalize_token(word["text"]) for word in words]
        for start in range(len(words) - span + 1):
            # Require every word of the phrase to match, not just the first.
            if normalized[start:start + span] != tokens:
                continue
            rect = words[start]["rect"]
            for word in words[start + 1:start + span]:
                # ``|`` yields a new rectangle covering both operands.
                rect = rect | word["rect"]
            return Highlight(
                page=page_num,
                rect=(rect.x0, rect.y0, rect.x1, rect.y1),
                color="yellow",
                message=text,
                category="text",
            )
    return None


# ------------------------------
# Streamlit Interface
# ------------------------------
def create_sidebar():
    """Create the sidebar with upload and analysis options.

    Returns:
        A ``(uploaded_file, options)`` pair: the value returned by the
        sidebar file uploader and a dict of the three checkbox states.
    """
    st.sidebar.title("PDF Analyzer")
    uploaded_file = st.sidebar.file_uploader("Upload PDF", type=['pdf'])

    with st.sidebar.expander("Analysis Options", expanded=False):
        options = {
            "check_language": st.checkbox("Language Analysis", value=True),
            "check_references": st.checkbox("Reference Analysis", value=True),
            "check_structure": st.checkbox("Structure Analysis", value=True),
        }

    return uploaded_file, options


def display_pdf_viewer(pdf_bytes: bytes, highlights: List[Highlight]):
    """Display the PDF with highlights using a custom viewer.

    Args:
        pdf_bytes: Raw PDF content.
        highlights: Highlights to overlay on the document.
    """
    # Convert PDF bytes to base64
    b64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')

    # Create custom HTML for PDF viewer
    # NOTE(review): the template below contains only the overlay fragment and
    # never references b64_pdf; the viewer markup looks to be missing here.
    html_content = f"""
{generate_highlight_overlays(highlights)}
"""
    st.components.v1.html(html_content, height=800)


def generate_highlight_overlays(highlights: List[Highlight]) -> str:
    """Generate HTML for highlight overlays.

    Args:
        highlights: Highlights to render.

    Returns:
        The concatenation of one template fragment per highlight.
    """
    fragments = []
    for i, highlight in enumerate(highlights):
        # NOTE(review): the per-highlight template is empty apart from a line
        # break; presumably its markup used ``i`` and ``highlight`` - confirm.
        fragments.append(f"""
""")
    return "".join(fragments)


def display_analysis_results(results: AnalysisResult):
    """Display analysis results in the sidebar.

    Shows one metric per summary entry, then the messages grouped by their
    ``"category"`` key (``"Other"`` when absent), one expander per category.

    Args:
        results: The analysis outcome to render.
    """
    st.sidebar.markdown("## Analysis Results")

    # Display summary statistics
    st.sidebar.markdown("### Summary")
    for key, value in results.summary.items():
        st.sidebar.metric(key, value)

    # Display messages grouped by category
    messages_by_category: Dict[str, List[Dict[str, Any]]] = {}
    for message in results.messages:
        category = message.get("category", "Other")
        messages_by_category.setdefault(category, []).append(message)

    for category, messages in messages_by_category.items():
        with st.sidebar.expander(f"{category} ({len(messages)})"):
            for msg in messages:
                st.markdown(f"**{msg['title']}**")
                st.markdown(msg['description'])
                st.markdown("---")


def main(): st.set_page_config( page_title="PDF Analyzer", page_icon="📄", layout="wide", initial_sidebar_state="expanded" ) # Create sidebar and get user input uploaded_file, options = create_sidebar() if uploaded_file is not None: # Read PDF file pdf_bytes = uploaded_file.read() # Analyze PDF try: results, annotated_pdf = analyze_pdf(io.BytesIO(pdf_bytes)) # Create two columns col1, col2 = st.columns([0.7, 0.3]) with col1: st.markdown("### Document Preview") # Display PDF with highlights if annotated_pdf: display_pdf_viewer(annotated_pdf, results.get("highlights", [])) else: display_pdf_viewer(pdf_bytes, []) with col2: st.markdown("