Update app.py
Browse files
app.py
CHANGED
@@ -83,21 +83,17 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
83 |
cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
|
84 |
similarity_matrix = cosine_similarities.cpu().numpy()
|
85 |
|
86 |
-
# Find the most similar sentences
|
87 |
-
|
88 |
-
threshold = 0.7 # Similarity threshold for highlighting
|
89 |
|
90 |
for i in range(len(sentences1)):
|
91 |
-
max_similarity = 0
|
92 |
-
best_match_idx = -1
|
93 |
-
|
94 |
for j in range(len(sentences2)):
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
|
102 |
# Calculate overall similarity
|
103 |
max_similarities1 = np.max(similarity_matrix, axis=1)
|
@@ -105,38 +101,45 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
105 |
mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
|
106 |
overall_similarity = mean_similarity
|
107 |
|
108 |
-
return overall_similarity,
|
109 |
|
110 |
-
def create_similarity_barchart(
|
111 |
-
"""Create a bar chart showing similarity distribution"""
|
112 |
-
if not
|
113 |
return None
|
114 |
|
115 |
-
plt.figure(figsize=(
|
116 |
|
117 |
# Extract similarity scores
|
118 |
-
scores = [pair[2] for pair in
|
119 |
-
|
120 |
-
# Create bins
|
121 |
-
bins = [0.7, 0.8, 0.9, 1.0]
|
122 |
-
bin_labels = [
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
# Count pairs in each bin
|
125 |
counts, _ = np.histogram(scores, bins=bins)
|
126 |
|
127 |
-
# Create bar chart with colors
|
128 |
-
colors = ['#ffcc66', '#ffaa44', '#ff6666']
|
129 |
-
bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.
|
130 |
|
131 |
# Add value labels on bars
|
132 |
for i, (count, bar) in enumerate(zip(counts, bars)):
|
133 |
-
|
134 |
-
|
|
|
135 |
|
136 |
plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
|
137 |
plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
|
138 |
-
plt.title('
|
139 |
-
plt.xticks(range(len(bin_labels)), bin_labels, fontsize=
|
140 |
|
141 |
# Remove top and right spines
|
142 |
plt.gca().spines['top'].set_visible(False)
|
@@ -146,10 +149,12 @@ def create_similarity_barchart(similar_pairs):
|
|
146 |
plt.grid(axis='y', alpha=0.3)
|
147 |
|
148 |
# Add explanation
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
|
|
153 |
|
154 |
buf = BytesIO()
|
155 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
@@ -158,57 +163,62 @@ def create_similarity_barchart(similar_pairs):
|
|
158 |
|
159 |
return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
|
160 |
|
161 |
-
def create_similarity_summary(overall_similarity,
|
162 |
"""Create a text summary of the similarity analysis"""
|
163 |
-
summary = f"## π Similarity
|
164 |
summary += f"**Overall Similarity Score:** <span style='color: #4CAF50; font-size: 20px;'>{overall_similarity:.2%}</span>\n\n"
|
165 |
|
166 |
-
if
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
|
174 |
summary += "**Similarity Breakdown:**\n"
|
175 |
-
summary += f"- π΄ Very Strong Similarity (90-100%): {
|
176 |
-
summary += f"- π‘ Strong Similarity (80-89%): {
|
177 |
-
summary += f"- π Good Similarity (70-79%): {
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
|
182 |
-
'Education': ['education', 'learn', 'course', 'degree', 'academic'],
|
183 |
-
'Experience': ['experience', 'work', 'job', 'intern', 'position'],
|
184 |
-
'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
|
185 |
-
'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
|
186 |
-
}
|
187 |
-
|
188 |
-
concept_counts = {concept: 0 for concept in concepts.keys()}
|
189 |
-
concept_counts['Other'] = 0
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
else:
|
207 |
-
summary += "No significant similarities found above the
|
208 |
|
209 |
return summary
|
210 |
|
211 |
-
def group_similar_concepts(
|
212 |
"""Group similar sentences by concept using keyword extraction"""
|
213 |
concept_groups = defaultdict(list)
|
214 |
|
@@ -220,7 +230,7 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
|
|
220 |
'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
|
221 |
}
|
222 |
|
223 |
-
for sent1, sent2, score in
|
224 |
matched_concept = 'Other'
|
225 |
for concept, keywords in concepts.items():
|
226 |
if any(keyword in sent1.lower() for keyword in keywords) or \
|
@@ -231,6 +241,19 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
|
|
231 |
|
232 |
return concept_groups
|
233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
def similarity(file1, file2):
|
235 |
if file1 is None or file2 is None:
|
236 |
return "Please upload both documents.", None, None
|
@@ -260,21 +283,23 @@ def similarity(file1, file2):
|
|
260 |
error_msg += f"Document 2: {text2}"
|
261 |
return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
|
262 |
|
263 |
-
overall_similarity,
|
264 |
|
265 |
-
|
|
|
|
|
266 |
|
267 |
# Prepare detailed output
|
268 |
output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
|
269 |
|
270 |
-
if
|
271 |
-
output_html += f"<h4>Found {len(
|
272 |
|
273 |
for concept, pairs in concept_groups.items():
|
274 |
if pairs:
|
275 |
output_html += f"<h5>π {concept}:</h5>"
|
276 |
for i, (sent1, sent2, score) in enumerate(pairs):
|
277 |
-
color =
|
278 |
output_html += f"""
|
279 |
<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
|
280 |
<p><b>π Document 1:</b> {sent1}</p>
|
@@ -284,20 +309,20 @@ def similarity(file1, file2):
|
|
284 |
"""
|
285 |
else:
|
286 |
output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
|
287 |
-
output_html += "<p>β οΈ No significant similarities found above the
|
288 |
output_html += "</div>"
|
289 |
|
290 |
-
# Generate bar chart
|
291 |
-
barchart_image = create_similarity_barchart(
|
292 |
-
summary_text = create_similarity_summary(overall_similarity,
|
293 |
|
294 |
return output_html, summary_text, barchart_image
|
295 |
|
296 |
# Create a clean Gradio interface
|
297 |
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
|
298 |
gr.Markdown("""
|
299 |
-
# π Document Similarity
|
300 |
-
Upload two documents (PDF or DOCX) to compare their content
|
301 |
""")
|
302 |
|
303 |
with gr.Row():
|
@@ -310,14 +335,18 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
|
|
310 |
with gr.Column(scale=2):
|
311 |
gr.Markdown("### Analysis Results")
|
312 |
summary_output = gr.Markdown()
|
313 |
-
output_html = gr.HTML(label="
|
314 |
|
315 |
gr.Markdown("""
|
316 |
-
### π Similarity Distribution
|
|
|
317 |
**Color Guide:**
|
318 |
-
- π΄ Very Strong Similarity (90-100%)
|
319 |
-
- π‘ Strong Similarity (80-89%)
|
320 |
-
- π Good Similarity (70-79%)
|
|
|
|
|
|
|
321 |
""")
|
322 |
barchart_display = gr.HTML()
|
323 |
|
@@ -327,7 +356,7 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
|
|
327 |
|
328 |
barchart_html = "<p>No similarity data available for visualization</p>"
|
329 |
if barchart_img:
|
330 |
-
barchart_html = f'<img src="{barchart_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
|
331 |
|
332 |
return result_html, summary_text, barchart_html
|
333 |
|
@@ -341,6 +370,5 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
|
|
341 |
# Launch the application
|
342 |
if __name__ == "__main__":
|
343 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
344 |
-
|
345 |
|
346 |
|
|
|
83 |
cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
|
84 |
similarity_matrix = cosine_similarities.cpu().numpy()
|
85 |
|
86 |
+
# Find the most similar sentences (all pairs for comprehensive analysis)
|
87 |
+
all_pairs = []
|
|
|
88 |
|
89 |
for i in range(len(sentences1)):
|
|
|
|
|
|
|
90 |
for j in range(len(sentences2)):
|
91 |
+
similarity_score = similarity_matrix[i][j]
|
92 |
+
if similarity_score > 0.3: # Include even lower similarities for comprehensive analysis
|
93 |
+
all_pairs.append((sentences1[i], sentences2[j], similarity_score))
|
94 |
+
|
95 |
+
# Sort by similarity score (highest first)
|
96 |
+
all_pairs.sort(key=lambda x: x[2], reverse=True)
|
97 |
|
98 |
# Calculate overall similarity
|
99 |
max_similarities1 = np.max(similarity_matrix, axis=1)
|
|
|
101 |
mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
|
102 |
overall_similarity = mean_similarity
|
103 |
|
104 |
+
return overall_similarity, all_pairs
|
105 |
|
106 |
+
def create_similarity_barchart(all_pairs):
|
107 |
+
"""Create a bar chart showing similarity distribution across all levels"""
|
108 |
+
if not all_pairs:
|
109 |
return None
|
110 |
|
111 |
+
plt.figure(figsize=(14, 8))
|
112 |
|
113 |
# Extract similarity scores
|
114 |
+
scores = [pair[2] for pair in all_pairs]
|
115 |
+
|
116 |
+
# Create bins for all similarity levels
|
117 |
+
bins = [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]
|
118 |
+
bin_labels = [
|
119 |
+
'Slightly Related\n(30-49%)',
|
120 |
+
'Somewhat Related\n(50-69%)',
|
121 |
+
'Good Similarity\n(70-79%)',
|
122 |
+
'Strong Similarity\n(80-89%)',
|
123 |
+
'Very Strong Similarity\n(90-100%)'
|
124 |
+
]
|
125 |
|
126 |
# Count pairs in each bin
|
127 |
counts, _ = np.histogram(scores, bins=bins)
|
128 |
|
129 |
+
# Create bar chart with colors for all levels
|
130 |
+
colors = ['#cccccc', '#aaddff', '#ffcc66', '#ffaa44', '#ff6666']
|
131 |
+
bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.7)
|
132 |
|
133 |
# Add value labels on bars
|
134 |
for i, (count, bar) in enumerate(zip(counts, bars)):
|
135 |
+
if count > 0:
|
136 |
+
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
|
137 |
+
str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
|
138 |
|
139 |
plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
|
140 |
plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
|
141 |
+
plt.title('Complete Similarity Distribution Analysis', fontsize=16, fontweight='bold', pad=20)
|
142 |
+
plt.xticks(range(len(bin_labels)), bin_labels, fontsize=11)
|
143 |
|
144 |
# Remove top and right spines
|
145 |
plt.gca().spines['top'].set_visible(False)
|
|
|
149 |
plt.grid(axis='y', alpha=0.3)
|
150 |
|
151 |
# Add explanation
|
152 |
+
explanation_text = (
|
153 |
+
"This chart shows the complete range of similarity between all sentence pairs in your documents.\n"
|
154 |
+
"Pairs with less than 30% similarity are not shown as they are considered not similar."
|
155 |
+
)
|
156 |
+
plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=11, style='italic',
|
157 |
+
bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
|
158 |
|
159 |
buf = BytesIO()
|
160 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
|
|
163 |
|
164 |
return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
|
165 |
|
166 |
+
def create_similarity_summary(overall_similarity, all_pairs):
    """Create a Markdown summary of the similarity analysis.

    Args:
        overall_similarity: Overall score, rendered as a percentage with two
            decimals (so a fraction such as 0.5 is shown as "50.00%").
        all_pairs: Sequence of ``(sentence1, sentence2, score)`` triples.
            Scores are compared against the band limits 0.3 / 0.5 / 0.7 /
            0.8 / 0.9. An empty (or ``None``) value yields the
            "no significant similarities" message.

    Returns:
        A Markdown string with the overall score, the number of pairs in each
        similarity band and, when any pair scores 0.7 or higher, a
        per-category count of those pairs.
    """
    # (inclusive lower bound, Markdown label), strongest band first.
    bands = (
        (0.9, "🔴 Very Strong Similarity (90-100%)"),
        (0.8, "🟡 Strong Similarity (80-89%)"),
        (0.7, "🟠 Good Similarity (70-79%)"),
        (0.5, "🔵 Somewhat Related (50-69%)"),
        (0.3, "⚪ Slightly Related (30-49%)"),
    )
    # Keyword lists are matched as plain substrings, checked in this order.
    concepts = {
        'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'Education': ['education', 'learn', 'course', 'degree', 'academic'],
        'Experience': ['experience', 'work', 'job', 'intern', 'position'],
        'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability'],
    }

    parts = [
        "## 📊 Complete Similarity Analysis\n\n",
        "**Overall Similarity Score:** "
        "<span style='color: #4CAF50; font-size: 20px;'>"
        f"{overall_similarity:.2%}</span>\n\n",
    ]

    if not all_pairs:
        parts.append("No significant similarities found above the 30% threshold.\n")
        return "".join(parts)

    # Single pass: count every band and collect the pairs that are similar
    # enough (70%+) to be categorised by concept below.
    band_counts = [0] * len(bands)
    below_threshold = 0
    high_similarity_pairs = []
    for pair in all_pairs:
        score = pair[2]
        for idx, (floor, _label) in enumerate(bands):
            if score >= floor:
                band_counts[idx] += 1
                break
        else:
            if score < 0.3:
                below_threshold += 1
        if score >= 0.7:
            high_similarity_pairs.append(pair)

    parts.append("**Similarity Breakdown:**\n")
    parts.extend(
        f"- {label}: {count} pairs\n"
        for (_floor, label), count in zip(bands, band_counts)
    )
    if below_threshold:
        parts.append(
            f"- ❌ Not Similar (0-29%): {below_threshold} pairs (not shown)\n\n"
        )
    else:
        # Callers normally drop pairs under 30% before building all_pairs, so
        # a literal "0 pairs" here would misreport how many were dissimilar.
        parts.append("- ❌ Not Similar (0-29%): not counted (not shown)\n\n")

    if high_similarity_pairs:
        concept_counts = dict.fromkeys(concepts, 0)
        concept_counts['Other'] = 0

        for sent1, sent2, _score in high_similarity_pairs:
            texts = (sent1.lower(), sent2.lower())
            # First concept with a keyword in either sentence wins.
            category = next(
                (
                    name
                    for name, keywords in concepts.items()
                    if any(kw in text for text in texts for kw in keywords)
                ),
                'Other',
            )
            concept_counts[category] += 1

        parts.append("**Highly Similar Content by Category:**\n")
        parts.extend(
            f"- {concept}: {count} pairs\n"
            for concept, count in concept_counts.items()
            if count > 0
        )

    return "".join(parts)
|
220 |
|
221 |
+
def group_similar_concepts(all_pairs):
|
222 |
"""Group similar sentences by concept using keyword extraction"""
|
223 |
concept_groups = defaultdict(list)
|
224 |
|
|
|
230 |
'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
|
231 |
}
|
232 |
|
233 |
+
for sent1, sent2, score in all_pairs:
|
234 |
matched_concept = 'Other'
|
235 |
for concept, keywords in concepts.items():
|
236 |
if any(keyword in sent1.lower() for keyword in keywords) or \
|
|
|
241 |
|
242 |
return concept_groups
|
243 |
|
244 |
+
def get_similarity_color(score):
    """Return the hex colour of the similarity band that *score* falls in."""
    # (inclusive lower bound, colour), ordered from strongest to weakest band.
    palette = (
        (0.9, "#ff6666"),  # Red - Very Strong
        (0.8, "#ffaa44"),  # Orange - Strong
        (0.7, "#ffcc66"),  # Yellow - Good
        (0.5, "#aaddff"),  # Blue - Somewhat Related
    )
    for floor, shade in palette:
        if score >= floor:
            return shade
    # Anything under 0.5 gets the weakest-band colour.
    return "#cccccc"  # Gray - Slightly Related
|
256 |
+
|
257 |
def similarity(file1, file2):
|
258 |
if file1 is None or file2 is None:
|
259 |
return "Please upload both documents.", None, None
|
|
|
283 |
error_msg += f"Document 2: {text2}"
|
284 |
return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
|
285 |
|
286 |
+
overall_similarity, all_pairs = calculate_cosine_similarity(text1, text2)
|
287 |
|
288 |
+
# Filter to show only higher similarity pairs in detailed view (70%+)
|
289 |
+
high_similarity_pairs = [p for p in all_pairs if p[2] >= 0.7]
|
290 |
+
concept_groups = group_similar_concepts(high_similarity_pairs)
|
291 |
|
292 |
# Prepare detailed output
|
293 |
output_html = f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
|
294 |
|
295 |
+
if high_similarity_pairs:
|
296 |
+
output_html += f"<h4>Found {len(high_similarity_pairs)} significant similar sentence pairs (70%+):</h4>"
|
297 |
|
298 |
for concept, pairs in concept_groups.items():
|
299 |
if pairs:
|
300 |
output_html += f"<h5>π {concept}:</h5>"
|
301 |
for i, (sent1, sent2, score) in enumerate(pairs):
|
302 |
+
color = get_similarity_color(score)
|
303 |
output_html += f"""
|
304 |
<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
|
305 |
<p><b>π Document 1:</b> {sent1}</p>
|
|
|
309 |
"""
|
310 |
else:
|
311 |
output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
|
312 |
+
output_html += "<p>β οΈ No significant similarities found above the 70% threshold.</p>"
|
313 |
output_html += "</div>"
|
314 |
|
315 |
+
# Generate bar chart showing ALL similarity levels
|
316 |
+
barchart_image = create_similarity_barchart(all_pairs)
|
317 |
+
summary_text = create_similarity_summary(overall_similarity, all_pairs)
|
318 |
|
319 |
return output_html, summary_text, barchart_image
|
320 |
|
321 |
# Create a clean Gradio interface
|
322 |
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
|
323 |
gr.Markdown("""
|
324 |
+
# π Complete Document Similarity Analyzer
|
325 |
+
Upload two documents (PDF or DOCX) to compare their content across all similarity levels.
|
326 |
""")
|
327 |
|
328 |
with gr.Row():
|
|
|
335 |
with gr.Column(scale=2):
|
336 |
gr.Markdown("### Analysis Results")
|
337 |
summary_output = gr.Markdown()
|
338 |
+
output_html = gr.HTML(label="Highly Similar Content (70%+)")
|
339 |
|
340 |
gr.Markdown("""
|
341 |
+
### π Complete Similarity Distribution
|
342 |
+
|
343 |
**Color Guide:**
|
344 |
+
- π΄ Very Strong Similarity (90-100%) - Essentially identical content
|
345 |
+
- π‘ Strong Similarity (80-89%) - Very similar with minor differences
|
346 |
+
- π Good Similarity (70-79%) - Related concepts with noticeable differences
|
347 |
+
- π΅ Somewhat Related (50-69%) - Shared concepts but different focus
|
348 |
+
- βͺ Slightly Related (30-49%) - Barely related topics
|
349 |
+
- β Not Similar (0-29%) - Completely different content (not shown)
|
350 |
""")
|
351 |
barchart_display = gr.HTML()
|
352 |
|
|
|
356 |
|
357 |
barchart_html = "<p>No similarity data available for visualization</p>"
|
358 |
if barchart_img:
|
359 |
+
barchart_html = f'<img src="{barchart_img}" alt="Complete Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
|
360 |
|
361 |
return result_html, summary_text, barchart_html
|
362 |
|
|
|
370 |
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )
|
|
|
373 |
|
374 |
|