Spaces:

NaimaAqeel
/

document-similarity-checker

Running

App Files Files Community

NaimaAqeel committed 4 days ago

Commit

9ddc9f6

verified ·

1 Parent(s): a2ac660

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -52

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-import fitz  # PyMuPDF for PDF extraction
 import docx  # python-docx for DOCX extraction
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
@@ -8,20 +7,47 @@ from typing import List, Tuple, Dict
 import matplotlib.pyplot as plt
 import numpy as np
 from collections import defaultdict
 # Initialize the SentenceTransformer model
 model = SentenceTransformer('all-MiniLM-L6-v2')
 def extract_text_from_pdf(pdf_path):
     try:
-        doc = fitz.open(pdf_path)
         text = ""
         for page in doc:
             text += page.get_text()
         return text
     except Exception as e:
         print(f"Error extracting text from PDF: {str(e)}")
-        return ""
 def extract_text_from_docx(docx_path):
     try:
@@ -30,10 +56,13 @@ def extract_text_from_docx(docx_path):
         return text
     except Exception as e:
         print(f"Error extracting text from DOCX: {str(e)}")
-        return ""
 def preprocess_text(text: str) -> List[str]:
     """Split text into sentences and clean them"""
     # Split into sentences using regex
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
     # Clean sentences
@@ -46,6 +75,9 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     sentences1 = preprocess_text(doc1)
     sentences2 = preprocess_text(doc2)
     # Get embeddings for all sentences
     embeddings1 = model.encode(sentences1, convert_to_tensor=True)
     embeddings2 = model.encode(sentences2, convert_to_tensor=True)
@@ -70,33 +102,40 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
             similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity.item()))
     # Calculate overall similarity
-    if len(sentences1) > 0 and len(sentences2) > 0:
-        # Use max similarity for each sentence and average
-        max_similarities1 = cosine_similarities.max(dim=1)[0]
-        max_similarities2 = cosine_similarities.max(dim=0)[0]
-        mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
-        overall_similarity = mean_similarity.item()
-    else:
-        overall_similarity = 0.0
     return overall_similarity, similar_pairs
-def visualize_similarity(sentences1, sentences2, similarity_matrix):
-    """Create a heatmap visualization of sentence similarities"""
     plt.figure(figsize=(10, 8))
-    plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
     plt.colorbar(label='Similarity Score')
     plt.xlabel('Document 2 Sentences')
     plt.ylabel('Document 1 Sentences')
     plt.title('Sentence Similarity Heatmap')
     plt.tight_layout()
-    plt.savefig('similarity_heatmap.png')
     plt.close()
-    return 'similarity_heatmap.png'
 def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
     """Group similar sentences by concept using keyword extraction"""
-    # Simple keyword-based grouping (could be enhanced with NLP techniques)
     concept_groups = defaultdict(list)
     # Define some common concepts for SOPs
@@ -120,9 +159,35 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
     return concept_groups
 def similarity(file1, file2):
     # Extract text based on file type
-    text1 = extract_text_from_pdf(file1.name) if file1.name.endswith('.pdf') else extract_text_from_docx(file1.name)
-    text2 = extract_text_from_pdf(file2.name) if file2.name.endswith('.pdf') else extract_text_from_docx(file2.name)
     # Calculate similarity and get similar pairs
     overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
@@ -141,63 +206,68 @@ def similarity(file1, file2):
                 output_html += f"<h5>{concept.capitalize()}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
                     output_html += f"""
-                    <div style="background-color: #f0f0f0; padding: 10px; margin: 5px; border-radius: 5px;">
                         <p><b>Document 1:</b> {sent1}</p>
                         <p><b>Document 2:</b> {sent2}</p>
                         <p><b>Similarity:</b> {score:.2%}</p>
                     </div>
                     """
     else:
-        output_html += "<p>No significant similarities found above the threshold.</p>"
     # Generate similarity heatmap if there are sentences
     sentences1 = preprocess_text(text1)
     sentences2 = preprocess_text(text2)
     if sentences1 and sentences2:
         # Get embeddings for visualization
         embeddings1 = model.encode(sentences1, convert_to_tensor=True)
         embeddings2 = model.encode(sentences2, convert_to_tensor=True)
         similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()
-        # Generate and save heatmap
-        heatmap_path = visualize_similarity(sentences1, sentences2, similarity_matrix)
-        output_html += f'<h4>Similarity Heatmap:</h4><img src="/file={heatmap_path}" alt="Similarity Heatmap" style="max-width: 100%;">'
-    return output_html
-# Create a Gradio interface with enhanced features
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # Document Similarity Checker with Detailed Analysis
-    Upload two documents to compare their content and identify specific similarities.
     """)
     with gr.Row():
-        with gr.Column():
-            file1 = gr.File(label="Upload Document 1", file_types=[".pdf", ".docx"])
-            file2 = gr.File(label="Upload Document 2", file_types=[".pdf", ".docx"])
-            submit = gr.Button("Compare Documents", variant="primary")
-        with gr.Column():
-            output = gr.HTML(label="Similarity Analysis Results")
-    # Add examples for users to try
-    gr.Examples(
-        examples=[
-            [os.path.join(os.path.dirname(__file__), "sample1.pdf"), os.path.join(os.path.dirname(__file__), "sample2.pdf")],
-            [os.path.join(os.path.dirname(__file__), "sample1.docx"), os.path.join(os.path.dirname(__file__), "sample2.docx")]
-        ],
         inputs=[file1, file2],
-        outputs=output,
-        fn=similarity,
-        cache_examples=False
     )
-    submit.click(fn=similarity, inputs=[file1, file2], outputs=output)
-# Use the GRADIO_SERVER_PORT environment variable, default to 7860 if not set
-port = int(os.getenv('GRADIO_SERVER_PORT', 7860))
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=port)

 import os
 import docx  # python-docx for DOCX extraction
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 from collections import defaultdict
+import base64
+from io import BytesIO
+# Try to import PyMuPDF with proper error handling
+pymupdf_available = False
+try:
+    # Try importing PyMuPDF directly (the correct package)
+    import pymupdf
+    pymupdf_available = True
+    print("PyMuPDF imported successfully")
+except ImportError:
+    try:
+        # Try the older import style
+        import fitz
+        pymupdf_available = True
+        print("fitz imported successfully")
+    except ImportError:
+        print("PyMuPDF/fitz is not available. PDF extraction will not work.")
 # Initialize the SentenceTransformer model
 model = SentenceTransformer('all-MiniLM-L6-v2')
 def extract_text_from_pdf(pdf_path):
+    if not pymupdf_available:
+        return "PDF processing not available. Please install PyMuPDF."
     try:
+        # Use the correct import based on what's available
+        if 'pymupdf' in globals():
+            doc = pymupdf.open(pdf_path)
+        else:
+            import fitz
+            doc = fitz.open(pdf_path)
         text = ""
         for page in doc:
             text += page.get_text()
         return text
     except Exception as e:
         print(f"Error extracting text from PDF: {str(e)}")
+        return f"Error extracting PDF: {str(e)}"
 def extract_text_from_docx(docx_path):
     try:
         return text
     except Exception as e:
         print(f"Error extracting text from DOCX: {str(e)}")
+        return f"Error extracting DOCX: {str(e)}"
 def preprocess_text(text: str) -> List[str]:
     """Split text into sentences and clean them"""
+    if not text or text.strip() == "":
+        return []
     # Split into sentences using regex
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
     # Clean sentences
     sentences1 = preprocess_text(doc1)
     sentences2 = preprocess_text(doc2)
+    if not sentences1 or not sentences2:
+        return 0.0, []
     # Get embeddings for all sentences
     embeddings1 = model.encode(sentences1, convert_to_tensor=True)
     embeddings2 = model.encode(sentences2, convert_to_tensor=True)
             similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity.item()))
     # Calculate overall similarity
+    max_similarities1 = cosine_similarities.max(dim=1)[0]
+    max_similarities2 = cosine_similarities.max(dim=0)[0]
+    mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
+    overall_similarity = mean_similarity.item()
     return overall_similarity, similar_pairs
+def create_heatmap_image(sentences1, sentences2, similarity_matrix):
+    """Create a heatmap visualization of sentence similarities and return as base64"""
+    if len(sentences1) == 0 or len(sentences2) == 0:
+        return None
+    # Create figure
     plt.figure(figsize=(10, 8))
+    plt.imshow(similarity_matrix, cmap='viridis', interpolation='nearest')
     plt.colorbar(label='Similarity Score')
     plt.xlabel('Document 2 Sentences')
     plt.ylabel('Document 1 Sentences')
     plt.title('Sentence Similarity Heatmap')
     plt.tight_layout()
+    # Save to buffer
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     plt.close()
+    buf.seek(0)
+    # Convert to base64
+    img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
+    return f"data:image/png;base64,{img_base64}"
 def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
     """Group similar sentences by concept using keyword extraction"""
+    # Simple keyword-based grouping
     concept_groups = defaultdict(list)
     # Define some common concepts for SOPs
     return concept_groups
 def similarity(file1, file2):
+    if file1 is None or file2 is None:
+        return "Please upload both documents.", None
     # Extract text based on file type
+    try:
+        if file1.name.endswith('.pdf'):
+            text1 = extract_text_from_pdf(file1.name)
+        elif file1.name.endswith('.docx'):
+            text1 = extract_text_from_docx(file1.name)
+        else:
+            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None
+        if file2.name.endswith('.pdf'):
+            text2 = extract_text_from_pdf(file2.name)
+        elif file2.name.endswith('.docx'):
+            text2 = extract_text_from_docx(file2.name)
+        else:
+            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None
+    except Exception as e:
+        return f"Error processing files: {str(e)}", None
+    # Check if text extraction failed
+    if not text1 or not text2 or "Error" in text1 or "Error" in text2:
+        error_msg = ""
+        if "Error" in text1:
+            error_msg += f"Document 1: {text1} "
+        if "Error" in text2:
+            error_msg += f"Document 2: {text2}"
+        return error_msg if error_msg else "Error extracting text from one or both documents.", None
     # Calculate similarity and get similar pairs
     overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
                 output_html += f"<h5>{concept.capitalize()}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
                     output_html += f"""
+                    <div style="background-color: #f0f8ff; padding: 10px; margin: 5px; border-radius: 5px; border-left: 4px solid #4CAF50;">
                         <p><b>Document 1:</b> {sent1}</p>
                         <p><b>Document 2:</b> {sent2}</p>
                         <p><b>Similarity:</b> {score:.2%}</p>
                     </div>
                     """
     else:
+        output_html += "<p>No significant similarities found above the threshold (70%).</p>"
     # Generate similarity heatmap if there are sentences
     sentences1 = preprocess_text(text1)
     sentences2 = preprocess_text(text2)
+    heatmap_image = None
     if sentences1 and sentences2:
         # Get embeddings for visualization
         embeddings1 = model.encode(sentences1, convert_to_tensor=True)
         embeddings2 = model.encode(sentences2, convert_to_tensor=True)
         similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()
+        # Generate heatmap as base64 image
+        heatmap_image = create_heatmap_image(sentences1, sentences2, similarity_matrix)
+    return output_html, heatmap_image
+# Create a clean Gradio interface
+# Two-column page: uploads and the compare button on the left (scale=1),
+# the HTML analysis and the heatmap on the right (scale=2).
+with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 📄 Document Similarity Checker with Detailed Analysis
+    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
     """)
     with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Upload Documents")
+            file1 = gr.File(label="Document 1", file_types=[".pdf", ".docx"])
+            file2 = gr.File(label="Document 2", file_types=[".pdf", ".docx"])
+            submit_btn = gr.Button("Compare Documents", variant="primary")
+        with gr.Column(scale=2):
+            gr.Markdown("### Analysis Results")
+            output_html = gr.HTML(label="Similarity Analysis")
+            gr.Markdown("### Similarity Heatmap")
+            # The heatmap is shown through an HTML component because it is
+            # delivered as an <img> tag built in process_files below.
+            heatmap_display = gr.HTML()
+    # Define the processing function
+    def process_files(file1, file2):
+        """Run ``similarity`` and adapt its result for the two HTML outputs.
+
+        Returns ``(result_html, heatmap_html)``. ``heatmap_html`` is an
+        ``<img>`` tag wrapping the heatmap value returned by ``similarity``,
+        or an empty string when that value is falsy (e.g. ``None``).
+        """
+        result_html, heatmap_img = similarity(file1, file2)
+        heatmap_html = ""
+        if heatmap_img:
+            heatmap_html = f'<img src="{heatmap_img}" alt="Similarity Heatmap" style="max-width: 100%; border: 1px solid #ddd; border-radius: 5px; padding: 5px;">'
+        return result_html, heatmap_html
+    # Connect the button
+    # The two return values map, in order, onto output_html and heatmap_display.
+    submit_btn.click(
+        fn=process_files,
         inputs=[file1, file2],
+        outputs=[output_html, heatmap_display]
     )
+# Launch the application
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)