NaimaAqeel committed
Commit 714e663 · verified · 1 Parent(s): c101099

Update app.py

Files changed (1)
  1. app.py +106 -42
app.py CHANGED
@@ -75,7 +75,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     sentences2 = preprocess_text(doc2)
 
     if not sentences1 or not sentences2:
-        return 0.0, []
+        return 0.0, [], np.array([])
 
     # Get embeddings for all sentences
     embeddings1 = model.encode(sentences1, convert_to_tensor=True)
@@ -83,6 +83,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
 
     # Calculate cosine similarities between all sentence pairs
     cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
+    similarity_matrix = cosine_similarities.cpu().numpy()
 
     # Find the most similar sentences
     similar_pairs = []
@@ -93,20 +94,20 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
         best_match_idx = -1
 
         for j in range(len(sentences2)):
-            if cosine_similarities[i][j] > max_similarity:
-                max_similarity = cosine_similarities[i][j]
+            if similarity_matrix[i][j] > max_similarity:
+                max_similarity = similarity_matrix[i][j]
                 best_match_idx = j
 
         if max_similarity > threshold and best_match_idx != -1:
-            similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity.item()))
+            similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity))
 
     # Calculate overall similarity
-    max_similarities1 = cosine_similarities.max(dim=1)[0]
-    max_similarities2 = cosine_similarities.max(dim=0)[0]
-    mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
-    overall_similarity = mean_similarity.item()
+    max_similarities1 = np.max(similarity_matrix, axis=1)
+    max_similarities2 = np.max(similarity_matrix, axis=0)
+    mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
+    overall_similarity = mean_similarity
 
-    return overall_similarity, similar_pairs, cosine_similarities.cpu().numpy()
+    return overall_similarity, similar_pairs, similarity_matrix
 
 def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
     """Create multiple visualizations for similarity analysis"""
@@ -115,24 +116,34 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
 
     visualizations = []
 
-    # 1. Improved Heatmap
-    plt.figure(figsize=(12, 10))
+    # 1. Improved Heatmap with clear explanation
+    plt.figure(figsize=(14, 10))
 
     # Create a mask for values below threshold to make the heatmap clearer
-    mask = similarity_matrix < 0.5
+    mask = similarity_matrix < 0.3
 
-    # Use a diverging color palette for better contrast
+    # Use a clear color palette
     ax = sns.heatmap(similarity_matrix,
                      mask=mask,
-                     cmap='RdYlBu_r',
-                     center=0.7,
+                     cmap='YlOrRd',
+                     vmin=0.3,
+                     vmax=1.0,
                      xticklabels=False,
                      yticklabels=False,
                      cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
 
-    plt.title('Document Similarity Heatmap\n(Brighter colors = Higher similarity)', fontsize=14, pad=20)
-    plt.xlabel('Document 2 Sentences', fontsize=12)
-    plt.ylabel('Document 1 Sentences', fontsize=12)
+    plt.title('Document Similarity Heatmap\n\n🔴 Red = Very Similar 🟡 Yellow = Somewhat Similar ⚪ White = Not Similar',
+              fontsize=16, pad=20)
+    plt.xlabel('Document 2 Sentences', fontsize=14)
+    plt.ylabel('Document 1 Sentences', fontsize=14)
+
+    # Add explanation text
+    explanation_text = (
+        "This heatmap shows how similar each sentence in Document 1 is to each sentence in Document 2.\n"
+        "Bright red areas indicate very similar content, yellow areas show some similarity, \n"
+        "and white areas indicate little to no similarity."
+    )
+    plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=12, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
 
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
@@ -142,18 +153,34 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
     visualizations.append(heatmap_img)
 
     # 2. Similarity Distribution Chart
-    plt.figure(figsize=(10, 6))
+    plt.figure(figsize=(12, 8))
 
     # Flatten the similarity matrix and filter out low similarities
     flat_similarities = similarity_matrix.flatten()
     flat_similarities = flat_similarities[flat_similarities > 0.3]  # Only show meaningful similarities
 
-    plt.hist(flat_similarities, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
-    plt.axvline(x=0.7, color='red', linestyle='--', label='Similarity Threshold (70%)')
-    plt.xlabel('Similarity Score')
-    plt.ylabel('Frequency')
-    plt.title('Distribution of Sentence Similarities')
-    plt.legend()
+    # Create bins with labels
+    bins = [0.3, 0.5, 0.7, 0.9, 1.0]
+    bin_labels = ['Low (30-50%)', 'Medium (50-70%)', 'High (70-90%)', 'Very High (90-100%)']
+
+    # Create histogram
+    counts, bin_edges = np.histogram(flat_similarities, bins=bins)
+
+    # Create bar chart with colors
+    colors = ['#ff9999', '#ffcc99', '#c2e699', '#66b3ff']
+    bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black')
+
+    # Add value labels on bars
+    for i, (count, bar) in enumerate(zip(counts, bars)):
+        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
+                 str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
+
+    plt.axvline(x=1.5, color='red', linestyle='--', linewidth=2, label='Similarity Threshold (70%)')
+    plt.xlabel('Similarity Level', fontsize=14)
+    plt.ylabel('Number of Sentence Pairs', fontsize=14)
+    plt.title('Distribution of Sentence Similarities', fontsize=16)
+    plt.xticks(range(len(bin_labels)), bin_labels, rotation=45, ha='right')
+    plt.legend(fontsize=12)
     plt.grid(True, alpha=0.3)
 
     buf = BytesIO()
@@ -164,10 +191,10 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
     visualizations.append(dist_img)
 
     # 3. Top Similarity Pairs Bar Chart
-    plt.figure(figsize=(12, 8))
+    plt.figure(figsize=(14, 10))
 
     # Get top similarity scores and their positions
-    top_n = min(10, len(sentences1) * len(sentences2))
+    top_n = min(8, len(sentences1) * len(sentences2))
     if top_n > 0:
         # Flatten and get indices of top values
         flat_indices = np.argsort(similarity_matrix.flatten())[-top_n:]
@@ -176,15 +203,36 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
         # Convert flat indices to 2D indices
         rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
 
-        # Create labels
-        labels = [f"Sent {r+1} ↔ Sent {c+1}" for r, c in zip(rows, cols)]
+        # Create shortened labels for readability
+        labels = []
+        for r, c in zip(rows, cols):
+            sent1_short = sentences1[r][:50] + "..." if len(sentences1[r]) > 50 else sentences1[r]
+            sent2_short = sentences2[c][:50] + "..." if len(sentences2[c]) > 50 else sentences2[c]
+            labels.append(f"Pair {r+1}-{c+1}")
+
+        colors = ['#ff6666' if score >= 0.9 else '#ffcc66' if score >= 0.7 else '#66b3ff' for score in top_scores]
+        bars = plt.barh(range(len(top_scores)), top_scores, color=colors, edgecolor='black')
 
-        plt.barh(range(len(top_scores)), top_scores, color='lightcoral')
-        plt.yticks(range(len(top_scores)), labels)
-        plt.xlabel('Similarity Score')
-        plt.title('Top 10 Most Similar Sentence Pairs')
+        # Add value labels
+        for i, (score, bar) in enumerate(zip(top_scores, bars)):
+            plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
+                     f'{score:.2%}', ha='left', va='center', fontsize=11, fontweight='bold')
+
+        plt.yticks(range(len(top_scores)), labels, fontsize=11)
+        plt.xlabel('Similarity Score', fontsize=14)
+        plt.title('Top 8 Most Similar Sentence Pairs', fontsize=16)
+        plt.xlim(0, 1.1)
         plt.grid(True, alpha=0.3, axis='x')
 
+        # Add legend for colors
+        from matplotlib.patches import Patch
+        legend_elements = [
+            Patch(facecolor='#ff6666', label='Very Similar (≥90%)'),
+            Patch(facecolor='#ffcc66', label='Similar (70-89%)'),
+            Patch(facecolor='#66b3ff', label='Somewhat Similar (30-69%)')
+        ]
+        plt.legend(handles=legend_elements, loc='lower right')
+
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     plt.close()
@@ -199,7 +247,7 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
 def create_similarity_summary(overall_similarity, similar_pairs):
     """Create a text summary of the similarity analysis"""
     summary = f"## 📊 Similarity Summary\n\n"
-    summary += f"**Overall Similarity Score:** {overall_similarity:.2%}\n\n"
+    summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
 
     if similar_pairs:
         summary += f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n"
@@ -207,10 +255,12 @@ def create_similarity_summary(overall_similarity, similar_pairs):
         # Group by similarity ranges
         high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
         med_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.9])
+        low_sim = len([p for p in similar_pairs if 0.3 <= p[2] < 0.7])
 
         summary += "**Similarity Breakdown:**\n"
-        summary += f"- High Similarity (≥90%): {high_sim} pairs\n"
-        summary += f"- Medium Similarity (70-89%): {med_sim} pairs\n\n"
+        summary += f"- 🔴 Very High Similarity (≥90%): {high_sim} pairs\n"
+        summary += f"- 🟡 High Similarity (70-89%): {med_sim} pairs\n"
+        summary += f"- 🔵 Some Similarity (30-69%): {low_sim} pairs\n\n"
 
         # Most common concepts
         concepts = {
@@ -240,7 +290,7 @@ def create_similarity_summary(overall_similarity, similar_pairs):
             if count > 0:
                 summary += f"- {concept.capitalize()}: {count} pairs\n"
     else:
-        summary += "No significant similarities found above the 70% threshold.\n"
+        summary += "No significant similarities found above the 30% threshold.\n"
 
     return summary
 
@@ -310,7 +360,7 @@ def similarity(file1, file2):
             if pairs:
                 output_html += f"<h5>🔍 {concept.capitalize()}:</h5>"
                 for i, (sent1, sent2, score) in enumerate(pairs):
-                    color = "#4CAF50" if score >= 0.9 else "#FF9800" if score >= 0.7 else "#F44336"
+                    color = "#ff6666" if score >= 0.9 else "#ffcc66" if score >= 0.7 else "#66b3ff"
                     output_html += f"""
                     <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                         <p><b>📄 Document 1:</b> {sent1}</p>
@@ -320,7 +370,7 @@ def similarity(file1, file2):
                     """
             else:
                 output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
-                output_html += "<p>⚠️ No significant similarities found above the threshold (70%).</p>"
+                output_html += "<p>⚠️ No significant similarities found above the threshold (30%).</p>"
                 output_html += "</div>"
 
     # Generate visualizations
@@ -357,14 +407,27 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### 📈 Similarity Heatmap")
+            gr.Markdown("""
+            ### 📈 Similarity Heatmap
+            **Color Guide:**
+            - 🔴 Red = Very Similar (90-100%)
+            - 🟡 Yellow = Somewhat Similar (70-89%)
+            - ⚪ White = Not Similar (0-69%)
+            """)
             heatmap_display = gr.HTML()
         with gr.Column():
-            gr.Markdown("### 📊 Similarity Distribution")
+            gr.Markdown("""
+            ### 📊 Similarity Distribution
+            Shows how many sentence pairs fall into each similarity range.
+            The red line indicates the 70% similarity threshold.
+            """)
             dist_display = gr.HTML()
 
     with gr.Row():
-        gr.Markdown("### 🔍 Top Similar Pairs")
+        gr.Markdown("""
+        ### 🔍 Top Similar Pairs
+        The most similar sentences between your documents, with similarity scores.
+        """)
        top_pairs_display = gr.HTML()
 
     # Define the processing function
@@ -395,4 +458,5 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
 # Launch the application
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
+
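For context, here is a minimal usage sketch of `calculate_cosine_similarity` after this commit. It is not part of the commit itself: it assumes the Space's app.py is importable as a module (so `model`, `preprocess_text`, and the functions shown in the diff are defined there), and the two document strings are placeholders.

```python
# Hypothetical usage sketch; assumes app.py can be imported without launching the demo
# (the gr.Blocks UI is built at import time, but demo.launch() is guarded by __main__).
from app import (
    calculate_cosine_similarity,
    create_similarity_summary,
    create_similarity_visualizations,
    preprocess_text,
)

doc1 = "Machine learning builds models from data. Neural networks are one approach."
doc2 = "Neural networks are a machine learning approach that learns from data."

# After this commit the third return value is always a NumPy array
# (an empty array when either document yields no sentences), and the
# scores in similar_pairs are plain NumPy floats rather than torch tensors.
overall, pairs, matrix = calculate_cosine_similarity(doc1, doc2)

print(f"Overall similarity: {overall:.2%}")
for sent1, sent2, score in pairs:
    print(f"{score:.2f} | {sent1} <-> {sent2}")

print(create_similarity_summary(overall, pairs))

if matrix.size:  # guard against the empty-document case
    images = create_similarity_visualizations(
        preprocess_text(doc1), preprocess_text(doc2), matrix
    )
```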