Spaces:

NaimaAqeel
/

document-similarity-checker

Running

App Files Files Community

NaimaAqeel committed 1 day ago

Commit

28ea54b

verified ·

1 Parent(s): 092f11f

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -93

app.py CHANGED Viewed

@@ -83,21 +83,17 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
     similarity_matrix = cosine_similarities.cpu().numpy()
-    # Find the most similar sentences
-    similar_pairs = []
-    threshold = 0.7  # Similarity threshold for highlighting
     for i in range(len(sentences1)):
-        max_similarity = 0
-        best_match_idx = -1
         for j in range(len(sentences2)):
-            if similarity_matrix[i][j] > max_similarity:
-                max_similarity = similarity_matrix[i][j]
-                best_match_idx = j
-        if max_similarity > threshold and best_match_idx != -1:
-            similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity))
     # Calculate overall similarity
     max_similarities1 = np.max(similarity_matrix, axis=1)
@@ -105,38 +101,45 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
     overall_similarity = mean_similarity
-    return overall_similarity, similar_pairs
-def create_similarity_barchart(similar_pairs):
-    """Create a bar chart showing similarity distribution"""
-    if not similar_pairs:
         return None
-    plt.figure(figsize=(12, 8))
     # Extract similarity scores
-    scores = [pair[2] for pair in similar_pairs]
-    # Create bins with labels
-    bins = [0.7, 0.8, 0.9, 1.0]
-    bin_labels = ['Good (70-79%)', 'Strong (80-89%)', 'Very Strong (90-100%)']
     # Count pairs in each bin
     counts, _ = np.histogram(scores, bins=bins)
-    # Create bar chart with colors
-    colors = ['#ffcc66', '#ffaa44', '#ff6666']
-    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.6)
     # Add value labels on bars
     for i, (count, bar) in enumerate(zip(counts, bars)):
-        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
-                str(count), ha='center', va='bottom', fontsize=14, fontweight='bold')
     plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
     plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
-    plt.title('Document Similarity Distribution', fontsize=16, fontweight='bold', pad=20)
-    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=12)
     # Remove top and right spines
     plt.gca().spines['top'].set_visible(False)
@@ -146,10 +149,12 @@ def create_similarity_barchart(similar_pairs):
     plt.grid(axis='y', alpha=0.3)
     # Add explanation
-    plt.figtext(0.5, 0.01,
-               "This chart shows how many sentence pairs fall into each similarity range.\n"
-               "Higher bars indicate more content shared between documents at that similarity level.",
-               ha="center", fontsize=11, style='italic', bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
@@ -158,57 +163,62 @@ def create_similarity_barchart(similar_pairs):
     return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
-def create_similarity_summary(overall_similarity, similar_pairs):
     """Create a text summary of the similarity analysis"""
-    summary = f"## 📊 Similarity Summary\n\n"
     summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
-    if similar_pairs:
-        summary += f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n"
-        # Group by similarity ranges
-        high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
-        med_sim = len([p for p in similar_pairs if 0.8 <= p[2] < 0.9])
-        low_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.8])
         summary += "**Similarity Breakdown:**\n"
-        summary += f"- 🔴 Very Strong Similarity (90-100%): {high_sim} pairs\n"
-        summary += f"- 🟡 Strong Similarity (80-89%): {med_sim} pairs\n"
-        summary += f"- 🟠 Good Similarity (70-79%): {low_sim} pairs\n\n"
-        # Most common concepts
-        concepts = {
-            'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
-            'Education': ['education', 'learn', 'course', 'degree', 'academic'],
-            'Experience': ['experience', 'work', 'job', 'intern', 'position'],
-            'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
-            'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
-        }
-        concept_counts = {concept: 0 for concept in concepts.keys()}
-        concept_counts['Other'] = 0
-        for sent1, sent2, score in similar_pairs:
-            matched = False
-            for concept, keywords in concepts.items():
-                if any(keyword in sent1.lower() for keyword in keywords) or \
-                   any(keyword in sent2.lower() for keyword in keywords):
-                    concept_counts[concept] += 1
-                    matched = True
-                    break
-            if not matched:
-                concept_counts['Other'] += 1
-        summary += "**Similar Content by Category:**\n"
-        for concept, count in concept_counts.items():
-            if count > 0:
-                summary += f"- {concept}: {count} pairs\n"
     else:
-        summary += "No significant similarities found above the 70% threshold.\n"
     return summary
-def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
     """Group similar sentences by concept using keyword extraction"""
     concept_groups = defaultdict(list)
@@ -220,7 +230,7 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
         'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
     }
-    for sent1, sent2, score in similar_pairs:
         matched_concept = 'Other'
         for concept, keywords in concepts.items():
             if any(keyword in sent1.lower() for keyword in keywords) or \
@@ -231,6 +241,19 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
     return concept_groups
 def similarity(file1, file2):
     if file1 is None or file2 is None:
         return "Please upload both documents.", None, None
@@ -260,21 +283,23 @@ def similarity(file1, file2):
             error_msg += f"Document 2: {text2}"
         return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
-    overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
-    concept_groups = group_similar_concepts(similar_pairs)
     # Prepare detailed output
     output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
-    if similar_pairs:
-        output_html += f"<h4>Found {len(similar_pairs)} similar sentence pairs:</h4>"
         for concept, pairs in concept_groups.items():
             if pairs:
                 output_html += f"<h5>🔍 {concept}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
-                    color = "#ff6666" if score >= 0.9 else "#ffaa44" if score >= 0.8 else "#ffcc66"
                     output_html += f"""
                     <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                         <p><b>📄 Document 1:</b> {sent1}</p>
@@ -284,20 +309,20 @@ def similarity(file1, file2):
                     """
     else:
         output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
-        output_html += "<p>⚠️ No significant similarities found above the threshold (70%).</p>"
         output_html += "</div>"
-    # Generate bar chart
-    barchart_image = create_similarity_barchart(similar_pairs)
-    summary_text = create_similarity_summary(overall_similarity, similar_pairs)
     return output_html, summary_text, barchart_image
 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 📄 Document Similarity Checker
-    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
     """)
     with gr.Row():
@@ -310,14 +335,18 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
         with gr.Column(scale=2):
             gr.Markdown("### Analysis Results")
             summary_output = gr.Markdown()
-            output_html = gr.HTML(label="Detailed Similarities")
     gr.Markdown("""
-    ### 📊 Similarity Distribution
     **Color Guide:**
-    - 🔴 Very Strong Similarity (90-100%)
-    - 🟡 Strong Similarity (80-89%)
-    - 🟠 Good Similarity (70-79%)
     """)
     barchart_display = gr.HTML()
@@ -327,7 +356,7 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
         barchart_html = "<p>No similarity data available for visualization</p>"
         if barchart_img:
-            barchart_html = f'<img src="{barchart_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
         return result_html, summary_text, barchart_html
@@ -341,6 +370,5 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
 # Launch the application
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

     cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
     similarity_matrix = cosine_similarities.cpu().numpy()
+    # Find the most similar sentences (all pairs for comprehensive analysis)
+    all_pairs = []
     for i in range(len(sentences1)):
         for j in range(len(sentences2)):
+            similarity_score = similarity_matrix[i][j]
+            if similarity_score > 0.3:  # Include even lower similarities for comprehensive analysis
+                all_pairs.append((sentences1[i], sentences2[j], similarity_score))
+    # Sort by similarity score (highest first)
+    all_pairs.sort(key=lambda x: x[2], reverse=True)
     # Calculate overall similarity
     max_similarities1 = np.max(similarity_matrix, axis=1)
     mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
     overall_similarity = mean_similarity
+    return overall_similarity, all_pairs
+def create_similarity_barchart(all_pairs):
+    """Create a bar chart showing similarity distribution across all levels"""
+    if not all_pairs:
         return None
+    plt.figure(figsize=(14, 8))
     # Extract similarity scores
+    scores = [pair[2] for pair in all_pairs]
+    # Create bins for all similarity levels
+    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
+    bin_labels = [
+        'Slightly Related\n(30-49%)',
+        'Somewhat Related\n(50-69%)',
+        'Good Similarity\n(70-79%)',
+        'Strong Similarity\n(80-89%)',
+        'Very Strong Similarity\n(90-100%)'
+    ]
     # Count pairs in each bin
     counts, _ = np.histogram(scores, bins=bins)
+    # Create bar chart with colors for all levels
+    colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']
+    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)
     # Add value labels on bars
     for i, (count, bar) in enumerate(zip(counts, bars)):
+        if count > 0:
+            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
+                    str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
     plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
     plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
+    plt.title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
+    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=11)
     # Remove top and right spines
     plt.gca().spines['top'].set_visible(False)
     plt.grid(axis='y', alpha=0.3)
     # Add explanation
+    explanation_text = (
+        "This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
+        "Pairs with less than 30% similarity are not shown as they are considered not similar."
+    )
+    plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic',
+                bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
+def create_similarity_summary(overall_similarity, all_pairs):
     """Create a text summary of the similarity analysis"""
+    summary = f"## 📊 Complete Similarity Analysis\n\n"
     summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
+    if all_pairs:
+        # Count pairs in each category
+        very_strong = len([p for p in all_pairs if p[2] >= 0.9])
+        strong = len([p for p in all_pairs if 0.8 <= p[2] < 0.9])
+        good = len([p for p in all_pairs if 0.7 <= p[2] < 0.8])
+        somewhat_related = len([p for p in all_pairs if 0.5 <= p[2] < 0.7])
+        slightly_related = len([p for p in all_pairs if 0.3 <= p[2] < 0.5])
         summary += "**Similarity Breakdown:**\n"
+        summary += f"- 🔴 Very Strong Similarity (90-100%): {very_strong} pairs\n"
+        summary += f"- 🟡 Strong Similarity (80-89%): {strong} pairs\n"
+        summary += f"- 🟠 Good Similarity (70-79%): {good} pairs\n"
+        summary += f"- 🔵 Somewhat Related (50-69%): {somewhat_related} pairs\n"
+        summary += f"- ⚪ Slightly Related (30-49%): {slightly_related} pairs\n"
+        summary += f"- ❌ Not Similar (0-29%): {len([p for p in all_pairs if p[2] < 0.3])} pairs (not shown)\n\n"
+        # Most common concepts in higher similarity pairs
+        high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
+        if high_similarity_pairs:
+            concepts = {
+                'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
+                'Education': ['education', 'learn', 'course', 'degree', 'academic'],
+                'Experience': ['experience', 'work', 'job', 'intern', 'position'],
+                'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
+                'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
+            }
+            concept_counts = {concept: 0 for concept in concepts.keys()}
+            concept_counts['Other'] = 0
+            for sent1, sent2, score in high_similarity_pairs:
+                matched = False
+                for concept, keywords in concepts.items():
+                    if any(keyword in sent1.lower() for keyword in keywords) or \
+                       any(keyword in sent2.lower() for keyword in keywords):
+                        concept_counts[concept] += 1
+                        matched = True
+                        break
+                if not matched:
+                    concept_counts['Other'] += 1
+            summary += "**Highly Similar Content by Category:**\n"
+            for concept, count in concept_counts.items():
+                if count > 0:
+                    summary += f"- {concept}: {count} pairs\n"
     else:
+        summary += "No significant similarities found above the 30% threshold.\n"
     return summary
+def group_similar_concepts(all_pairs):
     """Group similar sentences by concept using keyword extraction"""
     concept_groups = defaultdict(list)
         'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
     }
+    for sent1, sent2, score in all_pairs:
         matched_concept = 'Other'
         for concept, keywords in concepts.items():
             if any(keyword in sent1.lower() for keyword in keywords) or \
     return concept_groups
+def get_similarity_color(score):
+    """Get color based on similarity score"""
+    if score >= 0.9:
+        return "#ff6666"  # Red - Very Strong
+    elif score >= 0.8:
+        return "#ffaa44"  # Orange - Strong
+    elif score >= 0.7:
+        return "#ffcc66"  # Yellow - Good
+    elif score >= 0.5:
+        return "#aaddff"  # Blue - Somewhat Related
+    else:
+        return "#cccccc"  # Gray - Slightly Related
 def similarity(file1, file2):
     if file1 is None or file2 is None:
         return "Please upload both documents.", None, None
             error_msg += f"Document 2: {text2}"
         return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
+    overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)
+    # Filter to show only higher similarity pairs in detailed view (70%+)
+    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
+    concept_groups = group_similar_concepts(high_similarity_pairs)
     # Prepare detailed output
     output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
+    if high_similarity_pairs:
+        output_html += f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"
         for concept, pairs in concept_groups.items():
             if pairs:
                 output_html += f"<h5>🔍 {concept}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
+                    color = get_similarity_color(score)
                     output_html += f"""
                     <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                         <p><b>📄 Document 1:</b> {sent1}</p>
                     """
     else:
         output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
+        output_html += "<p>⚠️ No significant similarities found above the 70% threshold.</p>"
         output_html += "</div>"
+    # Generate bar chart showing ALL similarity levels
+    barchart_image = create_similarity_barchart(all_pairs)
+    summary_text = create_similarity_summary(overall_similarity, all_pairs)
     return output_html, summary_text, barchart_image
 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 📄 Complete Document Similarity Analyzer
+    Upload two documents (PDF or DOCX) to compare their content across all similarity levels.
     """)
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("### Analysis Results")
             summary_output = gr.Markdown()
+            output_html = gr.HTML(label="Highly Similar Content (70%+)")
     gr.Markdown("""
+    ### 📊 Complete Similarity Distribution
     **Color Guide:**
+    - 🔴 Very Strong Similarity (90-100%) - Essentially identical content
+    - 🟡 Strong Similarity (80-89%) - Very similar with minor differences
+    - 🟠 Good Similarity (70-79%) - Related concepts with noticeable differences
+    - 🔵 Somewhat Related (50-69%) - Shared concepts but different focus
+    - ⚪ Slightly Related (30-49%) - Barely related topics
+    - ❌ Not Similar (0-29%) - Completely different content (not shown)
     """)
     barchart_display = gr.HTML()
         barchart_html = "<p>No similarity data available for visualization</p>"
         if barchart_img:
+            barchart_html = f'<img src="{barchart_img}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
         return result_html, summary_text, barchart_html
 # Launch the application
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)