Spaces:

NaimaAqeel
/

document-similarity-checker

Running

App Files Files Community

NaimaAqeel committed 1 day ago

Commit

092f11f

verified ·

1 Parent(s): 714e663

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -199

app.py CHANGED Viewed

@@ -9,8 +9,6 @@ import numpy as np
 from collections import defaultdict
 import base64
 from io import BytesIO
-import pandas as pd
-import seaborn as sns
 # Try to import PyMuPDF with proper error handling
 pymupdf_available = False
@@ -75,7 +73,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     sentences2 = preprocess_text(doc2)
     if not sentences1 or not sentences2:
-        return 0.0, [], np.array([])
     # Get embeddings for all sentences
     embeddings1 = model.encode(sentences1, convert_to_tensor=True)
@@ -107,142 +105,58 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
     overall_similarity = mean_similarity
-    return overall_similarity, similar_pairs, similarity_matrix
-def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
-    """Create multiple visualizations for similarity analysis"""
-    if len(sentences1) == 0 or len(sentences2) == 0:
-        return None, None, None
-    visualizations = []
-    # 1. Improved Heatmap with clear explanation
-    plt.figure(figsize=(14, 10))
-    # Create a mask for values below threshold to make the heatmap clearer
-    mask = similarity_matrix < 0.3
-    # Use a clear color palette
-    ax = sns.heatmap(similarity_matrix,
-                    mask=mask,
-                    cmap='YlOrRd',
-                    vmin=0.3,
-                    vmax=1.0,
-                    xticklabels=False,
-                    yticklabels=False,
-                    cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
-    plt.title('Document Similarity Heatmap\n\n🔴 Red = Very Similar  🟡 Yellow = Somewhat Similar  ⚪ White = Not Similar',
-              fontsize=16, pad=20)
-    plt.xlabel('Document 2 Sentences', fontsize=14)
-    plt.ylabel('Document 1 Sentences', fontsize=14)
-    # Add explanation text
-    explanation_text = (
-        "This heatmap shows how similar each sentence in Document 1 is to each sentence in Document 2.\n"
-        "Bright red areas indicate very similar content, yellow areas show some similarity, \n"
-        "and white areas indicate little to no similarity."
-    )
-    plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=12, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
-    buf = BytesIO()
-    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    plt.close()
-    buf.seek(0)
-    heatmap_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
-    visualizations.append(heatmap_img)
-    # 2. Similarity Distribution Chart
     plt.figure(figsize=(12, 8))
-    # Flatten the similarity matrix and filter out low similarities
-    flat_similarities = similarity_matrix.flatten()
-    flat_similarities = flat_similarities[flat_similarities > 0.3]  # Only show meaningful similarities
     # Create bins with labels
-    bins = [0.3, 0.5, 0.7, 0.9, 1.0]
-    bin_labels = ['Low (30-50%)', 'Medium (50-70%)', 'High (70-90%)', 'Very High (90-100%)']
-    # Create histogram
-    counts, bin_edges = np.histogram(flat_similarities, bins=bins)
     # Create bar chart with colors
-    colors = ['#ff9999', '#ffcc99', '#c2e699', '#66b3ff']
-    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black')
     # Add value labels on bars
     for i, (count, bar) in enumerate(zip(counts, bars)):
-        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
-                str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
-    plt.axvline(x=1.5, color='red', linestyle='--', linewidth=2, label='Similarity Threshold (70%)')
-    plt.xlabel('Similarity Level', fontsize=14)
-    plt.ylabel('Number of Sentence Pairs', fontsize=14)
-    plt.title('Distribution of Sentence Similarities', fontsize=16)
-    plt.xticks(range(len(bin_labels)), bin_labels, rotation=45, ha='right')
-    plt.legend(fontsize=12)
-    plt.grid(True, alpha=0.3)
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     plt.close()
     buf.seek(0)
-    dist_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
-    visualizations.append(dist_img)
-    # 3. Top Similarity Pairs Bar Chart
-    plt.figure(figsize=(14, 10))
-    # Get top similarity scores and their positions
-    top_n = min(8, len(sentences1) * len(sentences2))
-    if top_n > 0:
-        # Flatten and get indices of top values
-        flat_indices = np.argsort(similarity_matrix.flatten())[-top_n:]
-        top_scores = similarity_matrix.flatten()[flat_indices]
-        # Convert flat indices to 2D indices
-        rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
-        # Create shortened labels for readability
-        labels = []
-        for r, c in zip(rows, cols):
-            sent1_short = sentences1[r][:50] + "..." if len(sentences1[r]) > 50 else sentences1[r]
-            sent2_short = sentences2[c][:50] + "..." if len(sentences2[c]) > 50 else sentences2[c]
-            labels.append(f"Pair {r+1}-{c+1}")
-        colors = ['#ff6666' if score >= 0.9 else '#ffcc66' if score >= 0.7 else '#66b3ff' for score in top_scores]
-        bars = plt.barh(range(len(top_scores)), top_scores, color=colors, edgecolor='black')
-        # Add value labels
-        for i, (score, bar) in enumerate(zip(top_scores, bars)):
-            plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
-                    f'{score:.2%}', ha='left', va='center', fontsize=11, fontweight='bold')
-        plt.yticks(range(len(top_scores)), labels, fontsize=11)
-        plt.xlabel('Similarity Score', fontsize=14)
-        plt.title('Top 8 Most Similar Sentence Pairs', fontsize=16)
-        plt.xlim(0, 1.1)
-        plt.grid(True, alpha=0.3, axis='x')
-        # Add legend for colors
-        from matplotlib.patches import Patch
-        legend_elements = [
-            Patch(facecolor='#ff6666', label='Very Similar (≥90%)'),
-            Patch(facecolor='#ffcc66', label='Similar (70-89%)'),
-            Patch(facecolor='#66b3ff', label='Somewhat Similar (30-69%)')
-        ]
-        plt.legend(handles=legend_elements, loc='lower right')
-        buf = BytesIO()
-        plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-        plt.close()
-        buf.seek(0)
-        top_pairs_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
-        visualizations.append(top_pairs_img)
-    else:
-        visualizations.append(None)
-    return visualizations
 def create_similarity_summary(overall_similarity, similar_pairs):
     """Create a text summary of the similarity analysis"""
@@ -254,25 +168,25 @@ def create_similarity_summary(overall_similarity, similar_pairs):
         # Group by similarity ranges
         high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
-        med_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.9])
-        low_sim = len([p for p in similar_pairs if 0.3 <= p[2] < 0.7])
         summary += "**Similarity Breakdown:**\n"
-        summary += f"- 🔴 Very High Similarity (≥90%): {high_sim} pairs\n"
-        summary += f"- 🟡 High Similarity (70-89%): {med_sim} pairs\n"
-        summary += f"- 🔵 Some Similarity (30-69%): {low_sim} pairs\n\n"
         # Most common concepts
         concepts = {
-            'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
-            'education': ['education', 'learn', 'course', 'degree', 'academic'],
-            'experience': ['experience', 'work', 'job', 'intern', 'position'],
-            'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
-            'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
         }
         concept_counts = {concept: 0 for concept in concepts.keys()}
-        concept_counts['other'] = 0
         for sent1, sent2, score in similar_pairs:
             matched = False
@@ -283,14 +197,14 @@ def create_similarity_summary(overall_similarity, similar_pairs):
                     matched = True
                     break
             if not matched:
-                concept_counts['other'] += 1
         summary += "**Similar Content by Category:**\n"
         for concept, count in concept_counts.items():
             if count > 0:
-                summary += f"- {concept.capitalize()}: {count} pairs\n"
     else:
-        summary += "No significant similarities found above the 30% threshold.\n"
     return summary
@@ -299,15 +213,15 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
     concept_groups = defaultdict(list)
     concepts = {
-        'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
-        'education': ['education', 'learn', 'course', 'degree', 'academic'],
-        'experience': ['experience', 'work', 'job', 'intern', 'position'],
-        'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
-        'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
     }
     for sent1, sent2, score in similar_pairs:
-        matched_concept = 'other'
         for concept, keywords in concepts.items():
             if any(keyword in sent1.lower() for keyword in keywords) or \
                any(keyword in sent2.lower() for keyword in keywords):
@@ -319,7 +233,7 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
 def similarity(file1, file2):
     if file1 is None or file2 is None:
-        return "Please upload both documents.", None, None, None, None
     try:
         if file1.name.endswith('.pdf'):
@@ -327,16 +241,16 @@ def similarity(file1, file2):
         elif file1.name.endswith('.docx'):
             text1 = extract_text_from_docx(file1.name)
         else:
-            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None, None, None
         if file2.name.endswith('.pdf'):
             text2 = extract_text_from_pdf(file2.name)
         elif file2.name.endswith('.docx'):
             text2 = extract_text_from_docx(file2.name)
         else:
-            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None, None, None
     except Exception as e:
-        return f"Error processing files: {str(e)}", None, None, None, None
     if not text1 or not text2 or "Error" in text1 or "Error" in text2:
         error_msg = ""
@@ -344,9 +258,9 @@ def similarity(file1, file2):
             error_msg += f"Document 1: {text1} "
         if "Error" in text2:
             error_msg += f"Document 2: {text2}"
-        return error_msg if error_msg else "Error extracting text from one or both documents.", None, None, None, None
-    overall_similarity, similar_pairs, similarity_matrix = calculate_cosine_similarity(text1, text2)
     concept_groups = group_similar_concepts(similar_pairs)
@@ -358,9 +272,9 @@ def similarity(file1, file2):
         for concept, pairs in concept_groups.items():
             if pairs:
-                output_html += f"<h5>🔍 {concept.capitalize()}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
-                    color = "#ff6666" if score >= 0.9 else "#ffcc66" if score >= 0.7 else "#66b3ff"
                     output_html += f"""
                     <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                         <p><b>📄 Document 1:</b> {sent1}</p>
@@ -370,26 +284,19 @@ def similarity(file1, file2):
                     """
     else:
         output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
-        output_html += "<p>⚠️ No significant similarities found above the threshold (30%).</p>"
         output_html += "</div>"
-    # Generate visualizations
-    sentences1 = preprocess_text(text1)
-    sentences2 = preprocess_text(text2)
-    visualizations = [None, None, None]
-    summary_text = ""
-    if sentences1 and sentences2:
-        visualizations = create_similarity_visualizations(sentences1, sentences2, similarity_matrix)
-        summary_text = create_similarity_summary(overall_similarity, similar_pairs)
-    return output_html, summary_text, visualizations[0], visualizations[1], visualizations[2]
 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 📄 Document Similarity Checker with Detailed Analysis
     Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
     """)
@@ -405,58 +312,35 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
             summary_output = gr.Markdown()
             output_html = gr.HTML(label="Detailed Similarities")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("""
-            ### 📈 Similarity Heatmap
-            **Color Guide:**
-            - 🔴 Red = Very Similar (90-100%)
-            - 🟡 Yellow = Somewhat Similar (70-89%)
-            - ⚪ White = Not Similar (0-69%)
-            """)
-            heatmap_display = gr.HTML()
-        with gr.Column():
-            gr.Markdown("""
-            ### 📊 Similarity Distribution
-            Shows how many sentence pairs fall into each similarity range.
-            The red line indicates the 70% similarity threshold.
-            """)
-            dist_display = gr.HTML()
-    with gr.Row():
-        gr.Markdown("""
-        ### 🔝 Top Similar Pairs
-        The most similar sentences between your documents, with similarity scores.
-        """)
-        top_pairs_display = gr.HTML()
     # Define the processing function
     def process_files(file1, file2):
-        result_html, summary_text, heatmap_img, dist_img, top_pairs_img = similarity(file1, file2)
-        heatmap_html = "<p>No visualization available</p>"
-        if heatmap_img:
-            heatmap_html = f'<img src="{heatmap_img}" alt="Similarity Heatmap" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
-        dist_html = "<p>No visualization available</p>"
-        if dist_img:
-            dist_html = f'<img src="{dist_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
-        top_pairs_html = "<p>No visualization available</p>"
-        if top_pairs_img:
-            top_pairs_html = f'<img src="{top_pairs_img}" alt="Top Similar Pairs" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
-        return result_html, summary_text, heatmap_html, dist_html, top_pairs_html
     # Connect the button
     submit_btn.click(
         fn=process_files,
         inputs=[file1, file2],
-        outputs=[output_html, summary_output, heatmap_display, dist_display, top_pairs_display]
     )
 # Launch the application
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 from collections import defaultdict
 import base64
 from io import BytesIO
 # Try to import PyMuPDF with proper error handling
 pymupdf_available = False
     sentences2 = preprocess_text(doc2)
     if not sentences1 or not sentences2:
+        return 0.0, []
     # Get embeddings for all sentences
     embeddings1 = model.encode(sentences1, convert_to_tensor=True)
     mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
     overall_similarity = mean_similarity
+    return overall_similarity, similar_pairs
+def create_similarity_barchart(similar_pairs):
+    """Create a bar chart showing similarity distribution"""
+    if not similar_pairs:
+        return None
     plt.figure(figsize=(12, 8))
+    # Extract similarity scores
+    scores = [pair[2] for pair in similar_pairs]
     # Create bins with labels
+    bins = [0.7, 0.8, 0.9, 1.0]
+    bin_labels = ['Good (70-79%)', 'Strong (80-89%)', 'Very Strong (90-100%)']
+    # Count pairs in each bin
+    counts, _ = np.histogram(scores, bins=bins)
     # Create bar chart with colors
+    colors = ['#ffcc66', '#ffaa44', '#ff6666']
+    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.6)
     # Add value labels on bars
     for i, (count, bar) in enumerate(zip(counts, bars)):
+        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
+                str(count), ha='center', va='bottom', fontsize=14, fontweight='bold')
+    plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
+    plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
+    plt.title('Document Similarity Distribution', fontsize=16, fontweight='bold', pad=20)
+    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=12)
+    # Remove top and right spines
+    plt.gca().spines['top'].set_visible(False)
+    plt.gca().spines['right'].set_visible(False)
+    # Add grid for better readability
+    plt.grid(axis='y', alpha=0.3)
+    # Add explanation
+    plt.figtext(0.5, 0.01,
+               "This chart shows how many sentence pairs fall into each similarity range.\n"
+               "Higher bars indicate more content shared between documents at that similarity level.",
+               ha="center", fontsize=11, style='italic', bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     plt.close()
     buf.seek(0)
+    return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
 def create_similarity_summary(overall_similarity, similar_pairs):
     """Create a text summary of the similarity analysis"""
         # Group by similarity ranges
         high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
+        med_sim = len([p for p in similar_pairs if 0.8 <= p[2] < 0.9])
+        low_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.8])
         summary += "**Similarity Breakdown:**\n"
+        summary += f"- 🔴 Very Strong Similarity (90-100%): {high_sim} pairs\n"
+        summary += f"- 🟡 Strong Similarity (80-89%): {med_sim} pairs\n"
+        summary += f"- 🟠 Good Similarity (70-79%): {low_sim} pairs\n\n"
         # Most common concepts
         concepts = {
+            'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
+            'Education': ['education', 'learn', 'course', 'degree', 'academic'],
+            'Experience': ['experience', 'work', 'job', 'intern', 'position'],
+            'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
+            'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
         }
         concept_counts = {concept: 0 for concept in concepts.keys()}
+        concept_counts['Other'] = 0
         for sent1, sent2, score in similar_pairs:
             matched = False
                     matched = True
                     break
             if not matched:
+                concept_counts['Other'] += 1
         summary += "**Similar Content by Category:**\n"
         for concept, count in concept_counts.items():
             if count > 0:
+                summary += f"- {concept}: {count} pairs\n"
     else:
+        summary += "No significant similarities found above the 70% threshold.\n"
     return summary
     concept_groups = defaultdict(list)
     concepts = {
+        'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
+        'Education': ['education', 'learn', 'course', 'degree', 'academic'],
+        'Experience': ['experience', 'work', 'job', 'intern', 'position'],
+        'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
+        'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
     }
     for sent1, sent2, score in similar_pairs:
+        matched_concept = 'Other'
         for concept, keywords in concepts.items():
             if any(keyword in sent1.lower() for keyword in keywords) or \
                any(keyword in sent2.lower() for keyword in keywords):
 def similarity(file1, file2):
     if file1 is None or file2 is None:
+        return "Please upload both documents.", None, None
     try:
         if file1.name.endswith('.pdf'):
         elif file1.name.endswith('.docx'):
             text1 = extract_text_from_docx(file1.name)
         else:
+            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None
         if file2.name.endswith('.pdf'):
             text2 = extract_text_from_pdf(file2.name)
         elif file2.name.endswith('.docx'):
             text2 = extract_text_from_docx(file2.name)
         else:
+            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
     except Exception as e:
+        return f"Error processing files: {str(e)}", None, None
     if not text1 or not text2 or "Error" in text1 or "Error" in text2:
         error_msg = ""
             error_msg += f"Document 1: {text1} "
         if "Error" in text2:
             error_msg += f"Document 2: {text2}"
+        return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
+    overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
     concept_groups = group_similar_concepts(similar_pairs)
         for concept, pairs in concept_groups.items():
             if pairs:
+                output_html += f"<h5>🔍 {concept}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
+                    color = "#ff6666" if score >= 0.9 else "#ffaa44" if score >= 0.8 else "#ffcc66"
                     output_html += f"""
                     <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                         <p><b>📄 Document 1:</b> {sent1}</p>
                     """
     else:
         output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
+        output_html += "<p>⚠️ No significant similarities found above the threshold (70%).</p>"
         output_html += "</div>"
+    # Generate bar chart
+    barchart_image = create_similarity_barchart(similar_pairs)
+    summary_text = create_similarity_summary(overall_similarity, similar_pairs)
+    return output_html, summary_text, barchart_image
 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 📄 Document Similarity Checker
     Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
     """)
             summary_output = gr.Markdown()
             output_html = gr.HTML(label="Detailed Similarities")
+    gr.Markdown("""
+    ### 📊 Similarity Distribution
+    **Color Guide:**
+    - 🔴 Very Strong Similarity (90-100%)
+    - 🟡 Strong Similarity (80-89%)
+    - 🟠 Good Similarity (70-79%)
+    """)
+    barchart_display = gr.HTML()
     # Define the processing function
     def process_files(file1, file2):
+        result_html, summary_text, barchart_img = similarity(file1, file2)
+        barchart_html = "<p>No similarity data available for visualization</p>"
+        if barchart_img:
+            barchart_html = f'<img src="{barchart_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
+        return result_html, summary_text, barchart_html
     # Connect the button
     submit_btn.click(
         fn=process_files,
         inputs=[file1, file2],
+        outputs=[output_html, summary_output, barchart_display]
     )
 # Launch the application
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)