Spaces:

NaimaAqeel
/

document-similarity-checker

Running

App Files Files Community

NaimaAqeel committed 1 day ago

Commit

6982985

verified ·

1 Parent(s): bca98ff

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -61

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-import docx  # python-docx for DOCX extraction
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
@@ -9,17 +9,17 @@ import numpy as np
 from collections import defaultdict
 import base64
 from io import BytesIO
 # Try to import PyMuPDF with proper error handling
 pymupdf_available = False
 try:
-    # Try importing PyMuPDF directly (the correct package)
     import pymupdf
     pymupdf_available = True
     print("PyMuPDF imported successfully")
 except ImportError:
     try:
-        # Try the older import style
         import fitz
         pymupdf_available = True
         print("fitz imported successfully")
@@ -34,7 +34,6 @@ def extract_text_from_pdf(pdf_path):
         return "PDF processing not available. Please install PyMuPDF."
     try:
-        # Use the correct import based on what's available
         if 'pymupdf' in globals():
             doc = pymupdf.open(pdf_path)
         else:
@@ -107,38 +106,148 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
     overall_similarity = mean_similarity.item()
-    return overall_similarity, similar_pairs
-def create_heatmap_image(sentences1, sentences2, similarity_matrix):
-    """Create a heatmap visualization of sentence similarities and return as base64"""
     if len(sentences1) == 0 or len(sentences2) == 0:
-        return None
-    # Create figure
-    plt.figure(figsize=(10, 8))
-    plt.imshow(similarity_matrix, cmap='viridis', interpolation='nearest')
-    plt.colorbar(label='Similarity Score')
-    plt.xlabel('Document 2 Sentences')
-    plt.ylabel('Document 1 Sentences')
-    plt.title('Sentence Similarity Heatmap')
-    plt.tight_layout()
-    # Save to buffer
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     plt.close()
     buf.seek(0)
-    # Convert to base64
-    img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
-    return f"data:image/png;base64,{img_base64}"
 def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
     """Group similar sentences by concept using keyword extraction"""
-    # Simple keyword-based grouping
     concept_groups = defaultdict(list)
-    # Define some common concepts for SOPs
     concepts = {
         'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
         'education': ['education', 'learn', 'course', 'degree', 'academic'],
@@ -160,76 +269,72 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
 def similarity(file1, file2):
     if file1 is None or file2 is None:
-        return "Please upload both documents.", None
-    # Extract text based on file type
     try:
         if file1.name.endswith('.pdf'):
             text1 = extract_text_from_pdf(file1.name)
         elif file1.name.endswith('.docx'):
             text1 = extract_text_from_docx(file1.name)
         else:
-            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None
         if file2.name.endswith('.pdf'):
             text2 = extract_text_from_pdf(file2.name)
         elif file2.name.endswith('.docx'):
             text2 = extract_text_from_docx(file2.name)
         else:
-            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None
     except Exception as e:
-        return f"Error processing files: {str(e)}", None
-    # Check if text extraction failed
     if not text1 or not text2 or "Error" in text1 or "Error" in text2:
         error_msg = ""
         if "Error" in text1:
             error_msg += f"Document 1: {text1} "
         if "Error" in text2:
             error_msg += f"Document 2: {text2}"
-        return error_msg if error_msg else "Error extracting text from one or both documents.", None
-    # Calculate similarity and get similar pairs
-    overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
-    # Group similar concepts
     concept_groups = group_similar_concepts(similar_pairs)
     # Prepare detailed output
-    output_html = f"<h3>Overall Similarity Score: {overall_similarity:.2%}</h3>"
     if similar_pairs:
-        output_html += "<h4>Similar Content Found:</h4>"
         for concept, pairs in concept_groups.items():
-            if pairs:  # Only show concepts with matches
-                output_html += f"<h5>{concept.capitalize()}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
                     output_html += f"""
-                    <div style="background-color: #f0f8ff; padding: 10px; margin: 5px; border-radius: 5px; border-left: 4px solid #4CAF50;">
-                        <p><b>Document 1:</b> {sent1}</p>
-                        <p><b>Document 2:</b> {sent2}</p>
-                        <p><b>Similarity:</b> {score:.2%}</p>
                     </div>
                     """
     else:
-        output_html += "<p>No significant similarities found above the threshold (70%).</p>"
-    # Generate similarity heatmap if there are sentences
     sentences1 = preprocess_text(text1)
     sentences2 = preprocess_text(text2)
-    heatmap_image = None
     if sentences1 and sentences2:
-        # Get embeddings for visualization
-        embeddings1 = model.encode(sentences1, convert_to_tensor=True)
-        embeddings2 = model.encode(sentences2, convert_to_tensor=True)
-        similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()
-        # Generate heatmap as base64 image
-        heatmap_image = create_heatmap_image(sentences1, sentences2, similarity_matrix)
-    return output_html, heatmap_image
 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
@@ -247,27 +352,47 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
         with gr.Column(scale=2):
             gr.Markdown("### Analysis Results")
-            output_html = gr.HTML(label="Similarity Analysis")
-            gr.Markdown("### Similarity Heatmap")
             heatmap_display = gr.HTML()
     # Define the processing function
     def process_files(file1, file2):
-        result_html, heatmap_img = similarity(file1, file2)
-        heatmap_html = ""
         if heatmap_img:
-            heatmap_html = f'<img src="{heatmap_img}" alt="Similarity Heatmap" style="max-width: 100%; border: 1px solid #ddd; border-radius: 5px; padding: 5px;">'
-        return result_html, heatmap_html
     # Connect the button
     submit_btn.click(
         fn=process_files,
         inputs=[file1, file2],
-        outputs=[output_html, heatmap_display]
     )
 # Launch the application
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
+import docx
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
 from collections import defaultdict
 import base64
 from io import BytesIO
+import pandas as pd
+import seaborn as sns
 # Try to import PyMuPDF with proper error handling
 pymupdf_available = False
 try:
     import pymupdf
     pymupdf_available = True
     print("PyMuPDF imported successfully")
 except ImportError:
     try:
         import fitz
         pymupdf_available = True
         print("fitz imported successfully")
         return "PDF processing not available. Please install PyMuPDF."
     try:
         if 'pymupdf' in globals():
             doc = pymupdf.open(pdf_path)
         else:
     mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
     overall_similarity = mean_similarity.item()
+    return overall_similarity, similar_pairs, cosine_similarities.cpu().numpy()
def _encode_figure(fig):
    """Serialize a matplotlib figure to a base64-encoded PNG ``data:`` URI."""
    buf = BytesIO()
    fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"


def _plot_similarity_heatmap(similarity_matrix):
    """Draw the sentence-by-sentence heatmap and return it as a data URI."""
    fig = plt.figure(figsize=(12, 10))
    try:
        # Cells scoring below 0.5 are masked so that only the stronger
        # matches are drawn.
        sns.heatmap(similarity_matrix,
                    mask=similarity_matrix < 0.5,
                    cmap='RdYlBu_r',
                    center=0.7,
                    xticklabels=False,
                    yticklabels=False,
                    cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
        plt.title('Document Similarity Heatmap\n(Brighter colors = Higher similarity)', fontsize=14, pad=20)
        plt.xlabel('Document 2 Sentences', fontsize=12)
        plt.ylabel('Document 1 Sentences', fontsize=12)
        return _encode_figure(fig)
    finally:
        # Close even when plotting fails so figures do not pile up across requests.
        plt.close(fig)


def _plot_similarity_distribution(similarity_matrix):
    """Draw a histogram of all pairwise scores above 0.3 and return a data URI."""
    fig = plt.figure(figsize=(10, 6))
    try:
        flat_similarities = similarity_matrix.flatten()
        # Scores at or below 0.3 are dropped from the histogram.
        flat_similarities = flat_similarities[flat_similarities > 0.3]
        plt.hist(flat_similarities, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        plt.axvline(x=0.7, color='red', linestyle='--', label='Similarity Threshold (70%)')
        plt.xlabel('Similarity Score')
        plt.ylabel('Frequency')
        plt.title('Distribution of Sentence Similarities')
        plt.legend()
        plt.grid(True, alpha=0.3)
        return _encode_figure(fig)
    finally:
        plt.close(fig)


def _plot_top_pairs(similarity_matrix, limit=10):
    """Draw a bar chart of the ``limit`` highest-scoring pairs, or return None."""
    flat_scores = similarity_matrix.flatten()
    top_n = min(limit, flat_scores.size)
    if top_n <= 0:
        # Nothing to rank; also avoids ``[-0:]`` selecting the whole array.
        return None

    # argsort is ascending, so the last ``top_n`` entries are the best matches.
    flat_indices = np.argsort(flat_scores)[-top_n:]
    top_scores = flat_scores[flat_indices]
    rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
    labels = [f"Sent {r+1} ↔ Sent {c+1}" for r, c in zip(rows, cols)]

    fig = plt.figure(figsize=(12, 8))
    try:
        positions = range(len(top_scores))
        plt.barh(positions, top_scores, color='lightcoral')
        plt.yticks(positions, labels)
        plt.xlabel('Similarity Score')
        # Report the number of bars actually drawn instead of a fixed "10".
        plt.title(f'Top {top_n} Most Similar Sentence Pairs')
        plt.grid(True, alpha=0.3, axis='x')
        return _encode_figure(fig)
    finally:
        plt.close(fig)


def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
    """Create the three charts used by the similarity report.

    Args:
        sentences1: Sentences of the first document.
        sentences2: Sentences of the second document.
        similarity_matrix: 2-D NumPy array of pairwise scores. The axis labels
            used here treat rows as document 1 and columns as document 2;
            presumably shaped (len(sentences1), len(sentences2)) -- confirm
            against the caller.

    Returns:
        A list ``[heatmap, distribution, top_pairs]`` of PNG ``data:`` URIs.
        Every entry is ``None`` when either sentence list is empty, and
        ``top_pairs`` is ``None`` when the matrix has no entries.
    """
    if len(sentences1) == 0 or len(sentences2) == 0:
        # Same container type as the normal path, so callers can treat the
        # result uniformly.
        return [None, None, None]

    return [
        _plot_similarity_heatmap(similarity_matrix),
        _plot_similarity_distribution(similarity_matrix),
        _plot_top_pairs(similarity_matrix),
    ]
def create_similarity_summary(overall_similarity, similar_pairs):
    """Create a Markdown summary of the similarity analysis.

    Args:
        overall_similarity: Overall score as a fraction (rendered as a percentage).
        similar_pairs: Iterable of ``(sentence1, sentence2, score)`` tuples for
            the sentence pairs considered similar; may be empty.

    Returns:
        Markdown text with the overall score and, when pairs exist, the pair
        count, a high/medium breakdown and per-category counts.
    """
    parts = [
        "## 📊 Similarity Summary\n\n",
        f"**Overall Similarity Score:** {overall_similarity:.2%}\n\n",
    ]

    if not similar_pairs:
        parts.append("No significant similarities found above the 70% threshold.\n")
        return "".join(parts)

    parts.append(f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n")

    # Bucket the pairs by score range.
    high_sim = sum(1 for _, _, score in similar_pairs if score >= 0.9)
    med_sim = sum(1 for _, _, score in similar_pairs if 0.7 <= score < 0.9)
    parts.append("**Similarity Breakdown:**\n")
    parts.append(f"- High Similarity (≥90%): {high_sim} pairs\n")
    parts.append(f"- Medium Similarity (70-89%): {med_sim} pairs\n\n")

    # Keyword lists used to assign each pair to a single category.
    concepts = {
        'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'education': ['education', 'learn', 'course', 'degree', 'academic'],
        'experience': ['experience', 'work', 'job', 'intern', 'position'],
        'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
    }
    concept_counts = dict.fromkeys(concepts, 0)
    concept_counts['other'] = 0

    for sent1, sent2, _score in similar_pairs:
        # Lowercase once per pair instead of once per keyword.
        lowered1 = sent1.lower()
        lowered2 = sent2.lower()
        # The first category (in dict order) with a keyword in either sentence wins.
        matched_concept = next(
            (
                concept
                for concept, keywords in concepts.items()
                if any(kw in lowered1 or kw in lowered2 for kw in keywords)
            ),
            'other',
        )
        concept_counts[matched_concept] += 1

    parts.append("**Similar Content by Category:**\n")
    parts.extend(
        f"- {concept.capitalize()}: {count} pairs\n"
        for concept, count in concept_counts.items()
        if count > 0
    )
    return "".join(parts)
 def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
     """Group similar sentences by concept using keyword extraction"""
     concept_groups = defaultdict(list)
     concepts = {
         'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
         'education': ['education', 'learn', 'course', 'degree', 'academic'],
 def similarity(file1, file2):
     """Compare two uploaded documents and build the similarity report.

     Args:
         file1: Upload for document 1; its ``.name`` is used as the file path.
         file2: Upload for document 2; its ``.name`` is used as the file path.

     Returns:
         A 5-tuple ``(report_html, summary_markdown, heatmap, distribution,
         top_pairs)``. On any failure the first element is a message and the
         remaining four are ``None``.
     """
     # Imported locally so the block does not depend on a module-level import.
     from html import escape

     no_results = (None, None, None, None)

     if file1 is None or file2 is None:
         return ("Please upload both documents.",) + no_results

     texts = []
     try:
         for label, upload in (("Document 1", file1), ("Document 2", file2)):
             # Compare extensions case-insensitively so "REPORT.PDF" is accepted.
             lowered_name = upload.name.lower()
             if lowered_name.endswith('.pdf'):
                 texts.append(extract_text_from_pdf(upload.name))
             elif lowered_name.endswith('.docx'):
                 texts.append(extract_text_from_docx(upload.name))
             else:
                 return (f"Unsupported file format for {label}. Please upload PDF or DOCX.",) + no_results
     except Exception as e:
         # The message is rendered as HTML, so the exception text is escaped.
         return (f"Error processing files: {escape(str(e))}",) + no_results
     text1, text2 = texts

     # NOTE(review): the extractors appear to report failures in-band as text
     # containing "Error"; a real document containing that word is also
     # rejected here. Confirm the extractors' error format before tightening.
     failed1 = bool(text1) and "Error" in text1
     failed2 = bool(text2) and "Error" in text2
     if not text1 or not text2 or failed1 or failed2:
         error_msg = ""
         if failed1:
             error_msg += f"Document 1: {escape(text1)} "
         if failed2:
             error_msg += f"Document 2: {escape(text2)}"
         return (error_msg if error_msg else "Error extracting text from one or both documents.",) + no_results

     overall_similarity, similar_pairs, similarity_matrix = calculate_cosine_similarity(text1, text2)
     concept_groups = group_similar_concepts(similar_pairs)

     # Prepare detailed output
     html_parts = [
         f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
     ]
     if similar_pairs:
         html_parts.append(f"<h4>Found {len(similar_pairs)} similar sentence pairs:</h4>")
         for concept, pairs in concept_groups.items():
             if not pairs:
                 continue
             html_parts.append(f"<h5>🔍 {concept.capitalize()}:</h5>")
             for sent1, sent2, score in pairs:
                 color = "#4CAF50" if score >= 0.9 else "#FF9800" if score >= 0.7 else "#F44336"
                 # Sentences come from the uploaded files; escape them so
                 # document content cannot inject markup into the page.
                 html_parts.append(
                     f'<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; '
                     f'border-radius: 8px; border-left: 5px solid {color};">'
                     f"<p><b>📄 Document 1:</b> {escape(sent1)}</p>"
                     f"<p><b>📄 Document 2:</b> {escape(sent2)}</p>"
                     f"<p><b>Similarity:</b> <span style='color: {color}; font-weight: bold;'>{score:.2%}</span></p>"
                     "</div>"
                 )
     else:
         html_parts.append("<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>")
         html_parts.append("<p>⚠️ No significant similarities found above the threshold (70%).</p>")
         html_parts.append("</div>")
     output_html = "".join(html_parts)

     # Generate visualizations
     sentences1 = preprocess_text(text1)
     sentences2 = preprocess_text(text2)
     visualizations = [None, None, None]
     summary_text = ""
     if sentences1 and sentences2:
         visualizations = create_similarity_visualizations(sentences1, sentences2, similarity_matrix)
         summary_text = create_similarity_summary(overall_similarity, similar_pairs)
     return output_html, summary_text, visualizations[0], visualizations[1], visualizations[2]
 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
         with gr.Column(scale=2):
             gr.Markdown("### Analysis Results")
+            summary_output = gr.Markdown()
+            output_html = gr.HTML(label="Detailed Similarities")
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### 📈 Similarity Heatmap")
             heatmap_display = gr.HTML()
+        with gr.Column():
+            gr.Markdown("### 📊 Similarity Distribution")
+            dist_display = gr.HTML()
+    with gr.Row():
+        gr.Markdown("### 🔝 Top Similar Pairs")
+        top_pairs_display = gr.HTML()
     # Define the processing function
     def process_files(file1, file2):
         """Run the similarity analysis and wrap each chart in display HTML.

         Returns the report HTML, the Markdown summary and one HTML snippet
         per chart (heatmap, distribution, top pairs). A chart with no image
         is replaced by a short placeholder paragraph.
         """
         result_html, summary_text, heatmap_img, dist_img, top_pairs_img = similarity(file1, file2)

         def render_chart(data_uri, alt_text):
             # A falsy image means the chart could not be produced.
             if not data_uri:
                 return "<p>No visualization available</p>"
             return f'<img src="{data_uri}" alt="{alt_text}" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'

         return (
             result_html,
             summary_text,
             render_chart(heatmap_img, "Similarity Heatmap"),
             render_chart(dist_img, "Similarity Distribution"),
             render_chart(top_pairs_img, "Top Similar Pairs"),
         )
     # Connect the button
     submit_btn.click(
         fn=process_files,
         inputs=[file1, file2],
+        outputs=[output_html, summary_output, heatmap_display, dist_display, top_pairs_display]
     )
 # Launch the application
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)