import base64
import io
import json
import os
import re
import tempfile
import traceback
from collections import Counter
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import language_tool_python
import streamlit as st
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

# Set JAVA_HOME environment variable
# NOTE(review): presumably needed by language_tool_python's Java backend; this
# unconditionally overrides any JAVA_HOME already set in the environment.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'


# ------------------------------
# Data Classes
# ------------------------------
@dataclass
class Highlight:
    """A rectangular region on one PDF page with display and message metadata."""

    page: int  # page number within the document
    rect: Tuple[float, float, float, float]  # bounding box as (x0, y0, x1, y1)
    color: str
    message: str
    category: str


@dataclass
class AnalysisResult:
    """Container bundling the highlights, messages and summary of an analysis."""

    highlights: List[Highlight]
    messages: List[Dict[str, Any]]
    summary: Dict[str, Any]


# ------------------------------
# PDF Processing Functions
# ------------------------------
def extract_pdf_text_by_page(file) -> List[str]:
    """Extract the text of each page of a PDF using PyMuPDF.

    Args:
        file: A filesystem path (``str``), the raw PDF content (``bytes``),
            or an ``io.BytesIO`` (or subclass) holding the PDF.

    Returns:
        One string per page, in page order, or an empty list when ``file``
        is of any other type.
    """
    if isinstance(file, str):
        # A str is a path: open it by filename instead of passing it as a
        # stream, which PyMuPDF does not accept.
        doc = fitz.open(file)
    elif isinstance(file, bytes):
        doc = fitz.open(stream=file, filetype="pdf")
    elif isinstance(file, io.BytesIO):
        # getvalue() returns the whole buffer regardless of the current
        # position and does not consume the stream, so the same object can
        # still be handed to other readers afterwards.
        doc = fitz.open(stream=file.getvalue(), filetype="pdf")
    else:
        return []

    try:
        return [page.get_text("text") for page in doc]
    finally:
        # Release the document even if text extraction raises.
        doc.close()


def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using pdfminer.

    Args:
        file: A filesystem path (``str``), the raw PDF content (``bytes``),
            or an ``io.BytesIO`` (or subclass) holding the PDF.

    Returns:
        The extracted text, or an empty string when ``file`` is of any
        other type.
    """
    if isinstance(file, bytes):
        # pdfminer expects a path or a binary file object, not raw bytes.
        source = io.BytesIO(file)
    elif isinstance(file, (str, io.BytesIO)):
        source = file
    else:
        return ""
    return extract_text(source, laparams=LAParams())


# ... (keep all your existing analysis functions) ...
# ------------------------------ # Highlight Processing Functions # ------------------------------ def get_word_coordinates(doc: fitz.Document) -> Dict[int, List[Dict[str, Any]]]: """Extract word coordinates from each page of the PDF.""" word_coordinates = {} for page_num, page in enumerate(doc): words = page.get_text("words") word_coordinates[page_num] = [ { "text": word[4], "rect": fitz.Rect(word[:4]), "origin": word[5:], } for word in words ] return word_coordinates def find_text_location(text: str, word_coordinates: Dict[int, List[Dict[str, Any]]]) -> Optional[Highlight]: """Find the location of text in the PDF and return a Highlight object.""" text_lower = text.lower() for page_num, words in word_coordinates.items(): for i in range(len(words)): if words[i]["text"].lower() in text_lower: # Find the complete phrase rect = words[i]["rect"] j = i + 1 while j < len(words) and j - i < len(text.split()): rect = rect | words[j]["rect"] j += 1 return Highlight( page=page_num, rect=(rect.x0, rect.y0, rect.x1, rect.y1), color="yellow", message=text, category="text" ) return None # ------------------------------ # Streamlit Interface # ------------------------------ def create_sidebar(): """Create the sidebar with upload and analysis options.""" st.sidebar.title("PDF Analyzer") uploaded_file = st.sidebar.file_uploader("Upload PDF", type=['pdf']) analysis_options = st.sidebar.expander("Analysis Options", expanded=False) with analysis_options: options = { "check_language": st.checkbox("Language Analysis", value=True), "check_references": st.checkbox("Reference Analysis", value=True), "check_structure": st.checkbox("Structure Analysis", value=True), } return uploaded_file, options def display_pdf_viewer(pdf_bytes: bytes, highlights: List[Highlight]): """Display the PDF with highlights using a custom viewer.""" # Convert PDF bytes to base64 b64_pdf = base64.b64encode(pdf_bytes).decode('utf-8') # Create custom HTML for PDF viewer html_content = f"""