Update app.py
Browse files
app.py
CHANGED
@@ -75,7 +75,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
75 |
sentences2 = preprocess_text(doc2)
|
76 |
|
77 |
if not sentences1 or not sentences2:
|
78 |
-
return 0.0, []
|
79 |
|
80 |
# Get embeddings for all sentences
|
81 |
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
|
@@ -83,6 +83,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
83 |
|
84 |
# Calculate cosine similarities between all sentence pairs
|
85 |
cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
|
|
|
86 |
|
87 |
# Find the most similar sentences
|
88 |
similar_pairs = []
|
@@ -93,20 +94,20 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
93 |
best_match_idx = -1
|
94 |
|
95 |
for j in range(len(sentences2)):
|
96 |
-
if
|
97 |
-
max_similarity =
|
98 |
best_match_idx = j
|
99 |
|
100 |
if max_similarity > threshold and best_match_idx != -1:
|
101 |
-
similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity
|
102 |
|
103 |
# Calculate overall similarity
|
104 |
-
max_similarities1 =
|
105 |
-
max_similarities2 =
|
106 |
-
mean_similarity = (
|
107 |
-
overall_similarity = mean_similarity
|
108 |
|
109 |
-
return overall_similarity, similar_pairs,
|
110 |
|
111 |
def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
112 |
"""Create multiple visualizations for similarity analysis"""
|
@@ -115,24 +116,34 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
|
115 |
|
116 |
visualizations = []
|
117 |
|
118 |
-
# 1. Improved Heatmap
|
119 |
-
plt.figure(figsize=(
|
120 |
|
121 |
# Create a mask for values below threshold to make the heatmap clearer
|
122 |
-
mask = similarity_matrix < 0.
|
123 |
|
124 |
-
# Use a
|
125 |
ax = sns.heatmap(similarity_matrix,
|
126 |
mask=mask,
|
127 |
-
cmap='
|
128 |
-
|
|
|
129 |
xticklabels=False,
|
130 |
yticklabels=False,
|
131 |
cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
|
132 |
|
133 |
-
plt.title('Document Similarity Heatmap\n
|
134 |
-
|
135 |
-
plt.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
buf = BytesIO()
|
138 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
@@ -142,18 +153,34 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
|
142 |
visualizations.append(heatmap_img)
|
143 |
|
144 |
# 2. Similarity Distribution Chart
|
145 |
-
plt.figure(figsize=(
|
146 |
|
147 |
# Flatten the similarity matrix and filter out low similarities
|
148 |
flat_similarities = similarity_matrix.flatten()
|
149 |
flat_similarities = flat_similarities[flat_similarities > 0.3] # Only show meaningful similarities
|
150 |
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
plt.grid(True, alpha=0.3)
|
158 |
|
159 |
buf = BytesIO()
|
@@ -164,10 +191,10 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
|
164 |
visualizations.append(dist_img)
|
165 |
|
166 |
# 3. Top Similarity Pairs Bar Chart
|
167 |
-
plt.figure(figsize=(
|
168 |
|
169 |
# Get top similarity scores and their positions
|
170 |
-
top_n = min(
|
171 |
if top_n > 0:
|
172 |
# Flatten and get indices of top values
|
173 |
flat_indices = np.argsort(similarity_matrix.flatten())[-top_n:]
|
@@ -176,15 +203,36 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
|
176 |
# Convert flat indices to 2D indices
|
177 |
rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
|
178 |
|
179 |
-
# Create labels
|
180 |
-
labels = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
186 |
plt.grid(True, alpha=0.3, axis='x')
|
187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
buf = BytesIO()
|
189 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
190 |
plt.close()
|
@@ -199,7 +247,7 @@ def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
|
199 |
def create_similarity_summary(overall_similarity, similar_pairs):
|
200 |
"""Create a text summary of the similarity analysis"""
|
201 |
summary = f"## π Similarity Summary\n\n"
|
202 |
-
summary += f"**Overall Similarity Score:** {overall_similarity:.2%}
|
203 |
|
204 |
if similar_pairs:
|
205 |
summary += f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n"
|
@@ -207,10 +255,12 @@ def create_similarity_summary(overall_similarity, similar_pairs):
|
|
207 |
# Group by similarity ranges
|
208 |
high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
|
209 |
med_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.9])
|
|
|
210 |
|
211 |
summary += "**Similarity Breakdown:**\n"
|
212 |
-
summary += f"- High Similarity (≥90%): {high_sim} pairs\n"
|
213 |
-
summary += f"-
|
|
|
214 |
|
215 |
# Most common concepts
|
216 |
concepts = {
|
@@ -240,7 +290,7 @@ def create_similarity_summary(overall_similarity, similar_pairs):
|
|
240 |
if count > 0:
|
241 |
summary += f"- {concept.capitalize()}: {count} pairs\n"
|
242 |
else:
|
243 |
-
summary += "No significant similarities found above the
|
244 |
|
245 |
return summary
|
246 |
|
@@ -310,7 +360,7 @@ def similarity(file1, file2):
|
|
310 |
if pairs:
|
311 |
output_html += f"<h5>π {concept.capitalize()}:</h5>"
|
312 |
for i, (sent1, sent2, score) in enumerate(pairs):
|
313 |
-
color = "#
|
314 |
output_html += f"""
|
315 |
<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
|
316 |
<p><b>π Document 1:</b> {sent1}</p>
|
@@ -320,7 +370,7 @@ def similarity(file1, file2):
|
|
320 |
"""
|
321 |
else:
|
322 |
output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
|
323 |
-
output_html += "<p>⚠️ No significant similarities found above the threshold (
|
324 |
output_html += "</div>"
|
325 |
|
326 |
# Generate visualizations
|
@@ -357,14 +407,27 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
|
|
357 |
|
358 |
with gr.Row():
|
359 |
with gr.Column():
|
360 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
heatmap_display = gr.HTML()
|
362 |
with gr.Column():
|
363 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
364 |
dist_display = gr.HTML()
|
365 |
|
366 |
with gr.Row():
|
367 |
-
gr.Markdown("
|
|
|
|
|
|
|
368 |
top_pairs_display = gr.HTML()
|
369 |
|
370 |
# Define the processing function
|
@@ -395,4 +458,5 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
|
|
395 |
# Launch the application
|
396 |
if __name__ == "__main__":
|
397 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
398 |
|
|
|
75 |
sentences2 = preprocess_text(doc2)
|
76 |
|
77 |
if not sentences1 or not sentences2:
|
78 |
+
return 0.0, [], np.array([])
|
79 |
|
80 |
# Get embeddings for all sentences
|
81 |
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
|
|
|
83 |
|
84 |
# Calculate cosine similarities between all sentence pairs
|
85 |
cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
|
86 |
+
similarity_matrix = cosine_similarities.cpu().numpy()
|
87 |
|
88 |
# Find the most similar sentences
|
89 |
similar_pairs = []
|
|
|
94 |
best_match_idx = -1
|
95 |
|
96 |
for j in range(len(sentences2)):
|
97 |
+
if similarity_matrix[i][j] > max_similarity:
|
98 |
+
max_similarity = similarity_matrix[i][j]
|
99 |
best_match_idx = j
|
100 |
|
101 |
if max_similarity > threshold and best_match_idx != -1:
|
102 |
+
similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity))
|
103 |
|
104 |
# Calculate overall similarity
|
105 |
+
max_similarities1 = np.max(similarity_matrix, axis=1)
|
106 |
+
max_similarities2 = np.max(similarity_matrix, axis=0)
|
107 |
+
mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
|
108 |
+
overall_similarity = mean_similarity
|
109 |
|
110 |
+
return overall_similarity, similar_pairs, similarity_matrix
|
111 |
|
112 |
def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
|
113 |
"""Create multiple visualizations for similarity analysis"""
|
|
|
116 |
|
117 |
visualizations = []
|
118 |
|
119 |
+
# 1. Improved Heatmap with clear explanation
|
120 |
+
plt.figure(figsize=(14, 10))
|
121 |
|
122 |
# Create a mask for values below threshold to make the heatmap clearer
|
123 |
+
mask = similarity_matrix < 0.3
|
124 |
|
125 |
+
# Use a clear color palette
|
126 |
ax = sns.heatmap(similarity_matrix,
|
127 |
mask=mask,
|
128 |
+
cmap='YlOrRd',
|
129 |
+
vmin=0.3,
|
130 |
+
vmax=1.0,
|
131 |
xticklabels=False,
|
132 |
yticklabels=False,
|
133 |
cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
|
134 |
|
135 |
+
plt.title('Document Similarity Heatmap\n\n🔴 Red = Very Similar 🟡 Yellow = Somewhat Similar ⚪ White = Not Similar',
|
136 |
+
fontsize=16, pad=20)
|
137 |
+
plt.xlabel('Document 2 Sentences', fontsize=14)
|
138 |
+
plt.ylabel('Document 1 Sentences', fontsize=14)
|
139 |
+
|
140 |
+
# Add explanation text
|
141 |
+
explanation_text = (
|
142 |
+
"This heatmap shows how similar each sentence in Document 1 is to each sentence in Document 2.\n"
|
143 |
+
"Bright red areas indicate very similar content, yellow areas show some similarity, \n"
|
144 |
+
"and white areas indicate little to no similarity."
|
145 |
+
)
|
146 |
+
plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=12, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
|
147 |
|
148 |
buf = BytesIO()
|
149 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
|
|
153 |
visualizations.append(heatmap_img)
|
154 |
|
155 |
# 2. Similarity Distribution Chart
|
156 |
+
plt.figure(figsize=(12, 8))
|
157 |
|
158 |
# Flatten the similarity matrix and filter out low similarities
|
159 |
flat_similarities = similarity_matrix.flatten()
|
160 |
flat_similarities = flat_similarities[flat_similarities > 0.3] # Only show meaningful similarities
|
161 |
|
162 |
+
# Create bins with labels
|
163 |
+
bins = [0.3, 0.5, 0.7, 0.9, 1.0]
|
164 |
+
bin_labels = ['Low (30-50%)', 'Medium (50-70%)', 'High (70-90%)', 'Very High (90-100%)']
|
165 |
+
|
166 |
+
# Create histogram
|
167 |
+
counts, bin_edges = np.histogram(flat_similarities, bins=bins)
|
168 |
+
|
169 |
+
# Create bar chart with colors
|
170 |
+
colors = ['#ff9999', '#ffcc99', '#c2e699', '#66b3ff']
|
171 |
+
bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black')
|
172 |
+
|
173 |
+
# Add value labels on bars
|
174 |
+
for i, (count, bar) in enumerate(zip(counts, bars)):
|
175 |
+
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
|
176 |
+
str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
|
177 |
+
|
178 |
+
plt.axvline(x=1.5, color='red', linestyle='--', linewidth=2, label='Similarity Threshold (70%)')
|
179 |
+
plt.xlabel('Similarity Level', fontsize=14)
|
180 |
+
plt.ylabel('Number of Sentence Pairs', fontsize=14)
|
181 |
+
plt.title('Distribution of Sentence Similarities', fontsize=16)
|
182 |
+
plt.xticks(range(len(bin_labels)), bin_labels, rotation=45, ha='right')
|
183 |
+
plt.legend(fontsize=12)
|
184 |
plt.grid(True, alpha=0.3)
|
185 |
|
186 |
buf = BytesIO()
|
|
|
191 |
visualizations.append(dist_img)
|
192 |
|
193 |
# 3. Top Similarity Pairs Bar Chart
|
194 |
+
plt.figure(figsize=(14, 10))
|
195 |
|
196 |
# Get top similarity scores and their positions
|
197 |
+
top_n = min(8, len(sentences1) * len(sentences2))
|
198 |
if top_n > 0:
|
199 |
# Flatten and get indices of top values
|
200 |
flat_indices = np.argsort(similarity_matrix.flatten())[-top_n:]
|
|
|
203 |
# Convert flat indices to 2D indices
|
204 |
rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
|
205 |
|
206 |
+
# Create shortened labels for readability
|
207 |
+
labels = []
|
208 |
+
for r, c in zip(rows, cols):
|
209 |
+
sent1_short = sentences1[r][:50] + "..." if len(sentences1[r]) > 50 else sentences1[r]
|
210 |
+
sent2_short = sentences2[c][:50] + "..." if len(sentences2[c]) > 50 else sentences2[c]
|
211 |
+
labels.append(f"Pair {r+1}-{c+1}")
|
212 |
+
|
213 |
+
colors = ['#ff6666' if score >= 0.9 else '#ffcc66' if score >= 0.7 else '#66b3ff' for score in top_scores]
|
214 |
+
bars = plt.barh(range(len(top_scores)), top_scores, color=colors, edgecolor='black')
|
215 |
|
216 |
+
# Add value labels
|
217 |
+
for i, (score, bar) in enumerate(zip(top_scores, bars)):
|
218 |
+
plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
|
219 |
+
f'{score:.2%}', ha='left', va='center', fontsize=11, fontweight='bold')
|
220 |
+
|
221 |
+
plt.yticks(range(len(top_scores)), labels, fontsize=11)
|
222 |
+
plt.xlabel('Similarity Score', fontsize=14)
|
223 |
+
plt.title('Top 8 Most Similar Sentence Pairs', fontsize=16)
|
224 |
+
plt.xlim(0, 1.1)
|
225 |
plt.grid(True, alpha=0.3, axis='x')
|
226 |
|
227 |
+
# Add legend for colors
|
228 |
+
from matplotlib.patches import Patch
|
229 |
+
legend_elements = [
|
230 |
+
Patch(facecolor='#ff6666', label='Very Similar (≥90%)'),
|
231 |
+
Patch(facecolor='#ffcc66', label='Similar (70-89%)'),
|
232 |
+
Patch(facecolor='#66b3ff', label='Somewhat Similar (30-69%)')
|
233 |
+
]
|
234 |
+
plt.legend(handles=legend_elements, loc='lower right')
|
235 |
+
|
236 |
buf = BytesIO()
|
237 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
238 |
plt.close()
|
|
|
247 |
def create_similarity_summary(overall_similarity, similar_pairs):
|
248 |
"""Create a text summary of the similarity analysis"""
|
249 |
summary = f"## π Similarity Summary\n\n"
|
250 |
+
summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
|
251 |
|
252 |
if similar_pairs:
|
253 |
summary += f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n"
|
|
|
255 |
# Group by similarity ranges
|
256 |
high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
|
257 |
med_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.9])
|
258 |
+
low_sim = len([p for p in similar_pairs if 0.3 <= p[2] < 0.7])
|
259 |
|
260 |
summary += "**Similarity Breakdown:**\n"
|
261 |
+
summary += f"- 🔴 Very High Similarity (≥90%): {high_sim} pairs\n"
|
262 |
+
summary += f"- 🟡 High Similarity (70-89%): {med_sim} pairs\n"
|
263 |
+
summary += f"- 🔵 Some Similarity (30-69%): {low_sim} pairs\n\n"
|
264 |
|
265 |
# Most common concepts
|
266 |
concepts = {
|
|
|
290 |
if count > 0:
|
291 |
summary += f"- {concept.capitalize()}: {count} pairs\n"
|
292 |
else:
|
293 |
+
summary += "No significant similarities found above the 30% threshold.\n"
|
294 |
|
295 |
return summary
|
296 |
|
|
|
360 |
if pairs:
|
361 |
output_html += f"<h5>π {concept.capitalize()}:</h5>"
|
362 |
for i, (sent1, sent2, score) in enumerate(pairs):
|
363 |
+
color = "#ff6666" if score >= 0.9 else "#ffcc66" if score >= 0.7 else "#66b3ff"
|
364 |
output_html += f"""
|
365 |
<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
|
366 |
<p><b>π Document 1:</b> {sent1}</p>
|
|
|
370 |
"""
|
371 |
else:
|
372 |
output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
|
373 |
+
output_html += "<p>⚠️ No significant similarities found above the threshold (30%).</p>"
|
374 |
output_html += "</div>"
|
375 |
|
376 |
# Generate visualizations
|
|
|
407 |
|
408 |
with gr.Row():
|
409 |
with gr.Column():
|
410 |
+
gr.Markdown("""
|
411 |
+
### π Similarity Heatmap
|
412 |
+
**Color Guide:**
|
413 |
+
- 🔴 Red = Very Similar (90-100%)
|
414 |
+
- 🟡 Yellow = Somewhat Similar (70-89%)
|
415 |
+
- ⚪ White = Not Similar (0-69%)
|
416 |
+
""")
|
417 |
heatmap_display = gr.HTML()
|
418 |
with gr.Column():
|
419 |
+
gr.Markdown("""
|
420 |
+
### π Similarity Distribution
|
421 |
+
Shows how many sentence pairs fall into each similarity range.
|
422 |
+
The red line indicates the 70% similarity threshold.
|
423 |
+
""")
|
424 |
dist_display = gr.HTML()
|
425 |
|
426 |
with gr.Row():
|
427 |
+
gr.Markdown("""
|
428 |
+
### π Top Similar Pairs
|
429 |
+
The most similar sentences between your documents, with similarity scores.
|
430 |
+
""")
|
431 |
top_pairs_display = gr.HTML()
|
432 |
|
433 |
# Define the processing function
|
|
|
458 |
# Launch the application
|
459 |
if __name__ == "__main__":
|
460 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
461 |
+
|
462 |
|