NaimaAqeel committed 28ea54b (verified) Β· 1 parent: 092f11f

Update app.py

Files changed (1): app.py (+121 -93)

app.py CHANGED
@@ -83,21 +83,17 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
     similarity_matrix = cosine_similarities.cpu().numpy()

-    # Find the most similar sentences
-    similar_pairs = []
-    threshold = 0.7  # Similarity threshold for highlighting
+    # Find the most similar sentences (all pairs for comprehensive analysis)
+    all_pairs = []

     for i in range(len(sentences1)):
-        max_similarity = 0
-        best_match_idx = -1
-
         for j in range(len(sentences2)):
-            if similarity_matrix[i][j] > max_similarity:
-                max_similarity = similarity_matrix[i][j]
-                best_match_idx = j
-
-        if max_similarity > threshold and best_match_idx != -1:
-            similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity))
+            similarity_score = similarity_matrix[i][j]
+            if similarity_score > 0.3:  # Include even lower similarities for comprehensive analysis
+                all_pairs.append((sentences1[i], sentences2[j], similarity_score))
+
+    # Sort by similarity score (highest first)
+    all_pairs.sort(key=lambda x: x[2], reverse=True)

     # Calculate overall similarity
     max_similarities1 = np.max(similarity_matrix, axis=1)
@@ -105,38 +101,45 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
     overall_similarity = mean_similarity

-    return overall_similarity, similar_pairs
+    return overall_similarity, all_pairs

-def create_similarity_barchart(similar_pairs):
-    """Create a bar chart showing similarity distribution"""
-    if not similar_pairs:
+def create_similarity_barchart(all_pairs):
+    """Create a bar chart showing similarity distribution across all levels"""
+    if not all_pairs:
         return None

-    plt.figure(figsize=(12, 8))
+    plt.figure(figsize=(14, 8))

     # Extract similarity scores
-    scores = [pair[2] for pair in similar_pairs]
-
-    # Create bins with labels
-    bins = [0.7, 0.8, 0.9, 1.0]
-    bin_labels = ['Good (70-79%)', 'Strong (80-89%)', 'Very Strong (90-100%)']
+    scores = [pair[2] for pair in all_pairs]
+
+    # Create bins for all similarity levels
+    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
+    bin_labels = [
+        'Slightly Related\n(30-49%)',
+        'Somewhat Related\n(50-69%)',
+        'Good Similarity\n(70-79%)',
+        'Strong Similarity\n(80-89%)',
+        'Very Strong Similarity\n(90-100%)'
+    ]

     # Count pairs in each bin
     counts, _ = np.histogram(scores, bins=bins)

-    # Create bar chart with colors
-    colors = ['#ffcc66', '#ffaa44', '#ff6666']
-    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.6)
+    # Create bar chart with colors for all levels
+    colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']
+    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)

     # Add value labels on bars
     for i, (count, bar) in enumerate(zip(counts, bars)):
-        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
-                 str(count), ha='center', va='bottom', fontsize=14, fontweight='bold')
+        if count > 0:
+            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
+                     str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')

     plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
     plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
-    plt.title('Document Similarity Distribution', fontsize=16, fontweight='bold', pad=20)
-    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=12)
+    plt.title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
+    plt.xticks(range(len(bin_labels)), bin_labels, fontsize=11)

     # Remove top and right spines
     plt.gca().spines['top'].set_visible(False)
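
For orientation, the reworked pair-collection logic above reduces to the sketch below. It is illustrative only, not part of the commit: the SentenceTransformer model and the sentence splitting are set up earlier in app.py (outside these hunks), the helper name pairwise_similarity is hypothetical, and max_similarities2 = np.max(similarity_matrix, axis=0) is inferred from the mean formula visible in the context lines.

    import numpy as np
    from sentence_transformers import util

    def pairwise_similarity(sentences1, sentences2, model):
        # Embed both sentence lists and build the full cosine-similarity matrix
        embeddings1 = model.encode(sentences1, convert_to_tensor=True)
        embeddings2 = model.encode(sentences2, convert_to_tensor=True)
        similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()

        # Keep every pair above the new 0.3 floor, best matches first
        all_pairs = [
            (sentences1[i], sentences2[j], float(similarity_matrix[i][j]))
            for i in range(len(sentences1))
            for j in range(len(sentences2))
            if similarity_matrix[i][j] > 0.3
        ]
        all_pairs.sort(key=lambda x: x[2], reverse=True)

        # Overall score: average of the best match per sentence in both directions
        overall = (np.mean(np.max(similarity_matrix, axis=1)) +
                   np.mean(np.max(similarity_matrix, axis=0))) / 2.0
        return overall, all_pairs

Fed with the two sentence lists, this returns the same (overall_similarity, all_pairs) shape that the rest of the patched code consumes.
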
@@ -146,10 +149,12 @@ def create_similarity_barchart(similar_pairs):
     plt.grid(axis='y', alpha=0.3)

     # Add explanation
-    plt.figtext(0.5, 0.01,
-                "This chart shows how many sentence pairs fall into each similarity range.\n"
-                "Higher bars indicate more content shared between documents at that similarity level.",
-                ha="center", fontsize=11, style='italic', bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
+    explanation_text = (
+        "This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
+        "Pairs with less than 30% similarity are not shown as they are considered not similar."
+    )
+    plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic',
+                bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})

     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
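
The bar heights in the new chart come directly from np.histogram over the widened bin edges: each score lands in a half-open bucket, and the last bucket is closed at 1.0, so a perfect match is still counted. A small self-contained check with made-up scores:

    import numpy as np

    scores = [0.35, 0.55, 0.72, 0.85, 0.93, 0.97]   # illustrative values, not real data
    bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]           # same edges as the new chart
    counts, _ = np.histogram(scores, bins=bins)
    print(counts)  # [1 1 1 1 2] -> one pair per bucket, two in the 90-100% bucket
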
@@ -158,57 +163,62 @@ def create_similarity_barchart(similar_pairs):

     return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"

-def create_similarity_summary(overall_similarity, similar_pairs):
+def create_similarity_summary(overall_similarity, all_pairs):
     """Create a text summary of the similarity analysis"""
-    summary = f"## πŸ“Š Similarity Summary\n\n"
+    summary = f"## πŸ“Š Complete Similarity Analysis\n\n"
     summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"

-    if similar_pairs:
-        summary += f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n"
-
-        # Group by similarity ranges
-        high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
-        med_sim = len([p for p in similar_pairs if 0.8 <= p[2] < 0.9])
-        low_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.8])
+    if all_pairs:
+        # Count pairs in each category
+        very_strong = len([p for p in all_pairs if p[2] >= 0.9])
+        strong = len([p for p in all_pairs if 0.8 <= p[2] < 0.9])
+        good = len([p for p in all_pairs if 0.7 <= p[2] < 0.8])
+        somewhat_related = len([p for p in all_pairs if 0.5 <= p[2] < 0.7])
+        slightly_related = len([p for p in all_pairs if 0.3 <= p[2] < 0.5])

         summary += "**Similarity Breakdown:**\n"
-        summary += f"- πŸ”΄ Very Strong Similarity (90-100%): {high_sim} pairs\n"
-        summary += f"- 🟑 Strong Similarity (80-89%): {med_sim} pairs\n"
-        summary += f"- 🟠 Good Similarity (70-79%): {low_sim} pairs\n\n"
-
-        # Most common concepts
-        concepts = {
-            'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
-            'Education': ['education', 'learn', 'course', 'degree', 'academic'],
-            'Experience': ['experience', 'work', 'job', 'intern', 'position'],
-            'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
-            'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
-        }
-
-        concept_counts = {concept: 0 for concept in concepts.keys()}
-        concept_counts['Other'] = 0
+        summary += f"- πŸ”΄ Very Strong Similarity (90-100%): {very_strong} pairs\n"
+        summary += f"- 🟑 Strong Similarity (80-89%): {strong} pairs\n"
+        summary += f"- 🟠 Good Similarity (70-79%): {good} pairs\n"
+        summary += f"- πŸ”΅ Somewhat Related (50-69%): {somewhat_related} pairs\n"
+        summary += f"- βšͺ Slightly Related (30-49%): {slightly_related} pairs\n"
+        summary += f"- ❌ Not Similar (0-29%): {len([p for p in all_pairs if p[2] < 0.3])} pairs (not shown)\n\n"

-        for sent1, sent2, score in similar_pairs:
-            matched = False
-            for concept, keywords in concepts.items():
-                if any(keyword in sent1.lower() for keyword in keywords) or \
-                   any(keyword in sent2.lower() for keyword in keywords):
-                    concept_counts[concept] += 1
-                    matched = True
-                    break
-            if not matched:
-                concept_counts['Other'] += 1
-
-        summary += "**Similar Content by Category:**\n"
-        for concept, count in concept_counts.items():
-            if count > 0:
-                summary += f"- {concept}: {count} pairs\n"
+        # Most common concepts in higher similarity pairs
+        high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
+        if high_similarity_pairs:
+            concepts = {
+                'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
+                'Education': ['education', 'learn', 'course', 'degree', 'academic'],
+                'Experience': ['experience', 'work', 'job', 'intern', 'position'],
+                'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
+                'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
+            }
+
+            concept_counts = {concept: 0 for concept in concepts.keys()}
+            concept_counts['Other'] = 0
+
+            for sent1, sent2, score in high_similarity_pairs:
+                matched = False
+                for concept, keywords in concepts.items():
+                    if any(keyword in sent1.lower() for keyword in keywords) or \
+                       any(keyword in sent2.lower() for keyword in keywords):
+                        concept_counts[concept] += 1
+                        matched = True
+                        break
+                if not matched:
+                    concept_counts['Other'] += 1
+
+            summary += "**Highly Similar Content by Category:**\n"
+            for concept, count in concept_counts.items():
+                if count > 0:
+                    summary += f"- {concept}: {count} pairs\n"
     else:
-        summary += "No significant similarities found above the 70% threshold.\n"
+        summary += "No significant similarities found above the 30% threshold.\n"

     return summary

-def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
+def group_similar_concepts(all_pairs):
     """Group similar sentences by concept using keyword extraction"""
     concept_groups = defaultdict(list)

@@ -220,7 +230,7 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
         'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
     }

-    for sent1, sent2, score in similar_pairs:
+    for sent1, sent2, score in all_pairs:
         matched_concept = 'Other'
         for concept, keywords in concepts.items():
             if any(keyword in sent1.lower() for keyword in keywords) or \
@@ -231,6 +241,19 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[

     return concept_groups

+def get_similarity_color(score):
+    """Get color based on similarity score"""
+    if score >= 0.9:
+        return "#ff6666"  # Red - Very Strong
+    elif score >= 0.8:
+        return "#ffaa44"  # Orange - Strong
+    elif score >= 0.7:
+        return "#ffcc66"  # Yellow - Good
+    elif score >= 0.5:
+        return "#aaddff"  # Blue - Somewhat Related
+    else:
+        return "#cccccc"  # Gray - Slightly Related
+
 def similarity(file1, file2):
     if file1 is None or file2 is None:
         return "Please upload both documents.", None, None
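
The new get_similarity_color helper centralizes the score-to-color mapping that the next hunk swaps in for the old inline conditional inside similarity(). A quick check of the thresholds, assuming the function is in scope exactly as defined above (scores picked for illustration):

    # Assumes get_similarity_color is defined as shown in the hunk above
    for score in (0.95, 0.85, 0.75, 0.60, 0.40):
        print(f"{score:.2f} -> {get_similarity_color(score)}")
    # 0.95 -> #ff6666, 0.85 -> #ffaa44, 0.75 -> #ffcc66, 0.60 -> #aaddff, 0.40 -> #cccccc
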
@@ -260,21 +283,23 @@ def similarity(file1, file2):
         error_msg += f"Document 2: {text2}"
         return error_msg if error_msg else "Error extracting text from one or both documents.", None, None

-    overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
+    overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)

-    concept_groups = group_similar_concepts(similar_pairs)
+    # Filter to show only higher similarity pairs in detailed view (70%+)
+    high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
+    concept_groups = group_similar_concepts(high_similarity_pairs)

     # Prepare detailed output
     output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"

-    if similar_pairs:
-        output_html += f"<h4>Found {len(similar_pairs)} similar sentence pairs:</h4>"
+    if high_similarity_pairs:
+        output_html += f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"

         for concept, pairs in concept_groups.items():
             if pairs:
                 output_html += f"<h5>πŸ” {concept}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
-                    color = "#ff6666" if score >= 0.9 else "#ffaa44" if score >= 0.8 else "#ffcc66"
+                    color = get_similarity_color(score)
                     output_html += f"""
                     <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                     <p><b>πŸ“„ Document 1:</b> {sent1}</p>
@@ -284,20 +309,20 @@ def similarity(file1, file2):
                     """
     else:
         output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
-        output_html += "<p>⚠️ No significant similarities found above the threshold (70%).</p>"
+        output_html += "<p>⚠️ No significant similarities found above the 70% threshold.</p>"
         output_html += "</div>"

-    # Generate bar chart
-    barchart_image = create_similarity_barchart(similar_pairs)
-    summary_text = create_similarity_summary(overall_similarity, similar_pairs)
+    # Generate bar chart showing ALL similarity levels
+    barchart_image = create_similarity_barchart(all_pairs)
+    summary_text = create_similarity_summary(overall_similarity, all_pairs)

     return output_html, summary_text, barchart_image

 # Create a clean Gradio interface
 with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # πŸ“„ Document Similarity Checker
-    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
+    # πŸ“„ Complete Document Similarity Analyzer
+    Upload two documents (PDF or DOCX) to compare their content across all similarity levels.
     """)

     with gr.Row():
@@ -310,14 +335,18 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
         with gr.Column(scale=2):
             gr.Markdown("### Analysis Results")
             summary_output = gr.Markdown()
-            output_html = gr.HTML(label="Detailed Similarities")
+            output_html = gr.HTML(label="Highly Similar Content (70%+)")

     gr.Markdown("""
-    ### πŸ“Š Similarity Distribution
+    ### πŸ“Š Complete Similarity Distribution
+
     **Color Guide:**
-    - πŸ”΄ Very Strong Similarity (90-100%)
-    - 🟑 Strong Similarity (80-89%)
-    - 🟠 Good Similarity (70-79%)
+    - πŸ”΄ Very Strong Similarity (90-100%) - Essentially identical content
+    - 🟑 Strong Similarity (80-89%) - Very similar with minor differences
+    - 🟠 Good Similarity (70-79%) - Related concepts with noticeable differences
+    - πŸ”΅ Somewhat Related (50-69%) - Shared concepts but different focus
+    - βšͺ Slightly Related (30-49%) - Barely related topics
+    - ❌ Not Similar (0-29%) - Completely different content (not shown)
     """)
     barchart_display = gr.HTML()

@@ -327,7 +356,7 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d

         barchart_html = "<p>No similarity data available for visualization</p>"
         if barchart_img:
-            barchart_html = f'<img src="{barchart_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
+            barchart_html = f'<img src="{barchart_img}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'

         return result_html, summary_text, barchart_html

@@ -341,6 +370,5 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
 # Launch the application
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
-

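
The chart reaches the browser as a base64 data URI wrapped in an <img> tag, as the unchanged BytesIO/savefig context lines and the wrapper hunk above show. The same pattern reduced to a self-contained snippet (the Agg backend line is an assumption for headless servers; the bar values are placeholders):

    import base64
    from io import BytesIO

    import matplotlib
    matplotlib.use("Agg")  # assumed headless backend; not shown in the diff
    import matplotlib.pyplot as plt

    plt.figure()
    plt.bar([0, 1, 2], [3, 1, 2])  # placeholder data
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    plt.close()

    data_uri = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
    html = f'<img src="{data_uri}" alt="Complete Similarity Distribution" style="max-width: 100%;">'
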