import streamlit as st
import re
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
import language_tool_python
from typing import List, Dict, Any, Tuple, Optional
from collections import Counter
import json
import traceback
import io
import tempfile
import os
import base64
from dataclasses import dataclass

# Point JAVA_HOME at a fixed OpenJDK 11 install path. The assignment is
# unconditional, so any JAVA_HOME already present in the process environment
# is overwritten.
# NOTE(review): presumably needed so language_tool_python (imported above) can
# locate a Java runtime — confirm, and confirm this path exists on the host.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'

# ------------------------------
# Data Classes
# ------------------------------

@dataclass
class Highlight:
    """A rectangular region on one PDF page, tagged with a message and category."""
    # Page key; find_text_location fills this from the page-number keys of the
    # word-coordinate mapping.
    page: int
    # Bounding box in (x0, y0, x1, y1) order.
    rect: Tuple[float, float, float, float]
    # Used as the CSS background-color of the overlay element.
    color: str
    # Used as the overlay element's title attribute.
    message: str
    # Free-form label; find_text_location sets it to "text".
    category: str

@dataclass
class AnalysisResult:
    """Bundle of analysis output: highlights, messages and summary statistics."""
    # Regions to highlight in the document.
    highlights: List[Highlight]
    # Message dicts. display_analysis_results reads "title" and "description"
    # from each one and groups them by an optional "category" key
    # (falling back to "Other").
    messages: List[Dict[str, Any]]
    # Label -> value pairs; display_analysis_results renders each pair as a
    # sidebar metric.
    summary: Dict[str, Any]

# ------------------------------
# PDF Processing Functions
# ------------------------------

def extract_pdf_text_by_page(file) -> List[str]:
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file: One of
            * ``str`` - treated as a filesystem path to the PDF,
            * ``bytes`` - the raw PDF data,
            * ``io.BytesIO`` - an in-memory PDF; the whole buffer is used
              regardless of the current read position, and the position is
              left untouched.

    Returns:
        One text string per page, in page order. An empty list is returned
        when ``file`` is not one of the supported types.

    Raises:
        Whatever PyMuPDF raises when the data cannot be opened as a PDF.
    """
    if not isinstance(file, (str, bytes, io.BytesIO)):
        return []

    if isinstance(file, str):
        # PyMuPDF's ``stream`` argument expects in-memory data, so a str is
        # opened as a filename instead (matching how extract_pdf_text treats it).
        doc = fitz.open(file)
    elif isinstance(file, io.BytesIO):
        # getvalue() yields the complete buffer even if an earlier reader
        # already advanced the stream position; read() would return only the
        # remainder (possibly nothing).
        doc = fitz.open(stream=file.getvalue(), filetype="pdf")
    else:
        doc = fitz.open(stream=file, filetype="pdf")

    try:
        return [page.get_text("text") for page in doc]
    finally:
        # Release the document even when text extraction fails part-way.
        doc.close()

def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using pdfminer with default layout params.

    Args:
        file: A ``str`` (passed to pdfminer as-is, i.e. as a path), raw PDF
            ``bytes``, or an ``io.BytesIO`` holding the PDF.

    Returns:
        The extracted text, or an empty string when ``file`` is not one of
        the supported types.

    Raises:
        Whatever pdfminer raises when the input cannot be parsed as a PDF.
    """
    if not isinstance(file, (str, bytes, io.BytesIO)):
        return ""

    if isinstance(file, bytes):
        # pdfminer's extract_text expects a path or a binary file object;
        # raw bytes are wrapped so they are read as an in-memory file.
        file = io.BytesIO(file)

    return extract_text(file, laparams=LAParams())

# ... (keep all your existing analysis functions) ...

# ------------------------------
# Highlight Processing Functions
# ------------------------------

def get_word_coordinates(doc: fitz.Document) -> Dict[int, List[Dict[str, Any]]]:
    """Build a per-page index of words and their bounding rectangles.

    Each page of ``doc`` is queried with ``get_text("words")``. Every returned
    word tuple becomes a dict with:

    * ``"text"``   - item 4 of the tuple (the word string),
    * ``"rect"``   - a ``fitz.Rect`` built from the first four items,
    * ``"origin"`` - all items after the text (``tuple[5:]``).

    NOTE(review): per PyMuPDF's documentation the trailing items are
    block/line/word numbers rather than a coordinate origin - confirm before
    relying on the ``"origin"`` name.

    Returns:
        Mapping of zero-based page index (enumeration order of ``doc``) to the
        list of word dicts for that page.
    """
    coords_by_page = {}
    for page_index, page in enumerate(doc):
        entries = []
        for word_tuple in page.get_text("words"):
            entries.append(
                {
                    "text": word_tuple[4],
                    "rect": fitz.Rect(word_tuple[:4]),
                    "origin": word_tuple[5:],
                }
            )
        coords_by_page[page_index] = entries
    return coords_by_page

def find_text_location(text: str, word_coordinates: Dict[int, List[Dict[str, Any]]]) -> Optional["Highlight"]:
    """Locate the first occurrence of a phrase in the word index of a PDF.

    ``text`` is split on whitespace and compared case-insensitively against
    consecutive words of each page. A hit requires every token of the phrase
    to equal the corresponding page word; a single word that merely appears
    somewhere inside ``text`` is not enough.

    Args:
        text: Phrase to search for.
        word_coordinates: Mapping of page number to a list of word dicts, each
            holding at least ``"text"`` and ``"rect"`` (the shape produced by
            ``get_word_coordinates``).

    Returns:
        A yellow ``Highlight`` of category ``"text"`` whose rectangle is the
        union of the matched words' rectangles, or ``None`` when ``text`` has
        no tokens or the phrase does not occur contiguously on any page.
    """
    tokens = text.lower().split()
    if not tokens:
        # Nothing to search for (empty or whitespace-only text).
        return None

    span = len(tokens)
    for page_num, words in word_coordinates.items():
        page_tokens = [word["text"].lower() for word in words]
        # Slide a window of the phrase length across the page; the range is
        # empty when the page holds fewer words than the phrase.
        for start in range(len(page_tokens) - span + 1):
            if page_tokens[start:start + span] != tokens:
                continue

            # Grow the bounding box over exactly the matched words.
            rect = words[start]["rect"]
            for word in words[start + 1:start + span]:
                rect = rect | word["rect"]

            return Highlight(
                page=page_num,
                rect=(rect.x0, rect.y0, rect.x1, rect.y1),
                color="yellow",
                message=text,
                category="text",
            )
    return None

# ------------------------------
# Streamlit Interface
# ------------------------------

def create_sidebar():
    """Build the sidebar: title, PDF uploader and the analysis toggles.

    Returns:
        A ``(uploaded_file, options)`` pair. ``uploaded_file`` is whatever the
        sidebar file uploader returns; ``options`` maps ``"check_language"``,
        ``"check_references"`` and ``"check_structure"`` to the current state
        of their checkboxes (each ticked by default).
    """
    sidebar = st.sidebar
    sidebar.title("PDF Analyzer")
    pdf_upload = sidebar.file_uploader("Upload PDF", type=['pdf'])

    # (option key, checkbox label) in display order
    toggles = (
        ("check_language", "Language Analysis"),
        ("check_references", "Reference Analysis"),
        ("check_structure", "Structure Analysis"),
    )

    # The checkboxes are created inside the collapsed expander
    with sidebar.expander("Analysis Options", expanded=False):
        selected = {key: st.checkbox(label, value=True) for key, label in toggles}

    return pdf_upload, selected

def display_pdf_viewer(pdf_bytes: bytes, highlights: List[Highlight]):
    """Render a PDF in an embedded iframe together with highlight overlay markup.

    The PDF bytes are base64-encoded into a ``data:`` URI used as the iframe
    source. Overlay elements are produced by ``generate_highlight_overlays``
    and placed in a sibling container, followed by the CSS for the
    ``highlight`` class. The assembled HTML is passed to Streamlit's HTML
    component with a height of 800.

    Args:
        pdf_bytes: Raw PDF data to embed.
        highlights: Highlights to turn into overlay elements.
    """
    # Inline the document as base64 so it can live inside the iframe's src
    encoded_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
    overlay_markup = generate_highlight_overlays(highlights)

    # Viewer markup followed by the overlay styling
    viewer_html = f"""
        <div style="position: relative; width: 100%; height: 800px;">
            <iframe src="data:application/pdf;base64,{encoded_pdf}"
                    width="100%"
                    height="100%"
                    style="border: none;">
            </iframe>
            <div id="highlight-container">
                {overlay_markup}
            </div>
        </div>
        <style>
            .highlight {{
                position: absolute;
                opacity: 0.3;
                pointer-events: all;
                cursor: pointer;
                transition: opacity 0.2s;
            }}
            .highlight:hover {{
                opacity: 0.5;
            }}
        </style>
    """

    st.components.v1.html(viewer_html, height=800)

def generate_highlight_overlays(highlights: List["Highlight"]) -> str:
    """Generate the HTML overlay elements for a list of highlights.

    Each highlight becomes an absolutely positioned ``<div class="highlight">``
    whose left/top come from ``rect[0]``/``rect[1]`` and whose width/height are
    the rect's extents, all in ``px``. The element's ``onclick`` calls
    ``showMessage`` with the highlight's index in the list, and its ``title``
    carries the highlight's message.

    The message and colour are HTML-escaped (including quotes) before being
    placed inside attribute values, so text containing ``"``, ``<`` or ``&``
    cannot break out of the attribute or inject markup.

    Args:
        highlights: Objects exposing ``rect`` (x0, y0, x1, y1), ``color`` and
            ``message``.

    Returns:
        The concatenated overlay markup; an empty string for no highlights.
    """
    import html  # local import: only this function needs it

    fragments = []
    for i, highlight in enumerate(highlights):
        x0, y0, x1, y1 = highlight.rect
        safe_color = html.escape(str(highlight.color), quote=True)
        safe_message = html.escape(str(highlight.message), quote=True)
        fragments.append(f"""
            <div class="highlight"
                 style="left: {x0}px;
                        top: {y0}px;
                        width: {x1 - x0}px;
                        height: {y1 - y0}px;
                        background-color: {safe_color};"
                 onclick="showMessage({i})"
                 title="{safe_message}">
            </div>
        """)
    return "".join(fragments)

def display_analysis_results(results: AnalysisResult):
    """Render the summary metrics and categorised messages in the sidebar.

    Every ``results.summary`` entry is shown as a sidebar metric. Messages are
    then bucketed by their ``"category"`` key (``"Other"`` when absent), in
    order of first appearance, and each bucket gets a sidebar expander titled
    ``"<category> (<count>)"`` listing the title and description of each
    message, separated by horizontal rules.

    Args:
        results: Analysis output providing ``summary`` and ``messages``; each
            message must contain ``"title"`` and ``"description"``.
    """
    sidebar = st.sidebar
    sidebar.markdown("## Analysis Results")

    # Summary statistics, one metric per entry
    sidebar.markdown("### Summary")
    for label, stat in results.summary.items():
        sidebar.metric(label, stat)

    # Bucket the messages by category, preserving first-seen order
    grouped = {}
    for entry in results.messages:
        grouped.setdefault(entry.get("category", "Other"), []).append(entry)

    for group_name, entries in grouped.items():
        with sidebar.expander(f"{group_name} ({len(entries)})"):
            for entry in entries:
                st.markdown(f"**{entry['title']}**")
                st.markdown(entry['description'])
                st.markdown("---")

def main():
    st.set_page_config(
        page_title="PDF Analyzer",
        page_icon="📄",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    
    # Create sidebar and get user input
    uploaded_file, options = create_sidebar()
    
    if uploaded_file is not None:
        # Read PDF file
        pdf_bytes = uploaded_file.read()
        
        # Analyze PDF
        try:
            results, annotated_pdf = analyze_pdf(io.BytesIO(pdf_bytes))
            
            # Create two columns
            col1, col2 = st.columns([0.7, 0.3])
            
            with col1:
                st.markdown("### Document Preview")
                # Display PDF with highlights
                if annotated_pdf:
                    display_pdf_viewer(annotated_pdf, results.get("highlights", []))
                else:
                    display_pdf_viewer(pdf_bytes, [])
            
            with col2:
                st.markdown("