Update app.py
Browse files
app.py
CHANGED
@@ -9,8 +9,6 @@ import numpy as np
|
|
9 |
from collections import defaultdict
|
10 |
import base64
|
11 |
from io import BytesIO
|
12 |
-
import pandas as pd
|
13 |
-
import seaborn as sns
|
14 |
|
15 |
# Try to import PyMuPDF with proper error handling
|
16 |
pymupdf_available = False
|
@@ -75,7 +73,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
75 |
sentences2 = preprocess_text(doc2)
|
76 |
|
77 |
if not sentences1 or not sentences2:
|
78 |
-
return 0.0, []
|
79 |
|
80 |
# Get embeddings for all sentences
|
81 |
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
|
@@ -107,142 +105,58 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
|
|
107 |
mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
|
108 |
overall_similarity = mean_similarity
|
109 |
|
110 |
-
return overall_similarity, similar_pairs
|
111 |
|
112 |
-
def
|
113 |
-
"""Create
|
114 |
-
if
|
115 |
-
return None
|
116 |
-
|
117 |
-
visualizations = []
|
118 |
-
|
119 |
-
# 1. Improved Heatmap with clear explanation
|
120 |
-
plt.figure(figsize=(14, 10))
|
121 |
-
|
122 |
-
# Create a mask for values below threshold to make the heatmap clearer
|
123 |
-
mask = similarity_matrix < 0.3
|
124 |
-
|
125 |
-
# Use a clear color palette
|
126 |
-
ax = sns.heatmap(similarity_matrix,
|
127 |
-
mask=mask,
|
128 |
-
cmap='YlOrRd',
|
129 |
-
vmin=0.3,
|
130 |
-
vmax=1.0,
|
131 |
-
xticklabels=False,
|
132 |
-
yticklabels=False,
|
133 |
-
cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
|
134 |
-
|
135 |
-
plt.title('Document Similarity Heatmap\n\n🔴 Red = Very Similar 🟡 Yellow = Somewhat Similar ⚪ White = Not Similar',
|
136 |
-
fontsize=16, pad=20)
|
137 |
-
plt.xlabel('Document 2 Sentences', fontsize=14)
|
138 |
-
plt.ylabel('Document 1 Sentences', fontsize=14)
|
139 |
-
|
140 |
-
# Add explanation text
|
141 |
-
explanation_text = (
|
142 |
-
"This heatmap shows how similar each sentence in Document 1 is to each sentence in Document 2.\n"
|
143 |
-
"Bright red areas indicate very similar content, yellow areas show some similarity, \n"
|
144 |
-
"and white areas indicate little to no similarity."
|
145 |
-
)
|
146 |
-
plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=12, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
|
147 |
|
148 |
-
buf = BytesIO()
|
149 |
-
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
150 |
-
plt.close()
|
151 |
-
buf.seek(0)
|
152 |
-
heatmap_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
|
153 |
-
visualizations.append(heatmap_img)
|
154 |
-
|
155 |
-
# 2. Similarity Distribution Chart
|
156 |
plt.figure(figsize=(12, 8))
|
157 |
|
158 |
-
#
|
159 |
-
|
160 |
-
flat_similarities = flat_similarities[flat_similarities > 0.3] # Only show meaningful similarities
|
161 |
|
162 |
# Create bins with labels
|
163 |
-
bins = [0.
|
164 |
-
bin_labels = ['
|
165 |
|
166 |
-
#
|
167 |
-
counts,
|
168 |
|
169 |
# Create bar chart with colors
|
170 |
-
colors = ['#
|
171 |
-
bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black')
|
172 |
|
173 |
# Add value labels on bars
|
174 |
for i, (count, bar) in enumerate(zip(counts, bars)):
|
175 |
-
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.
|
176 |
-
str(count), ha='center', va='bottom', fontsize=
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
-
|
179 |
-
plt.
|
180 |
-
plt.
|
181 |
-
|
182 |
-
|
183 |
-
plt.
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
buf = BytesIO()
|
187 |
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
188 |
plt.close()
|
189 |
buf.seek(0)
|
190 |
-
dist_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
|
191 |
-
visualizations.append(dist_img)
|
192 |
-
|
193 |
-
# 3. Top Similarity Pairs Bar Chart
|
194 |
-
plt.figure(figsize=(14, 10))
|
195 |
-
|
196 |
-
# Get top similarity scores and their positions
|
197 |
-
top_n = min(8, len(sentences1) * len(sentences2))
|
198 |
-
if top_n > 0:
|
199 |
-
# Flatten and get indices of top values
|
200 |
-
flat_indices = np.argsort(similarity_matrix.flatten())[-top_n:]
|
201 |
-
top_scores = similarity_matrix.flatten()[flat_indices]
|
202 |
-
|
203 |
-
# Convert flat indices to 2D indices
|
204 |
-
rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
|
205 |
-
|
206 |
-
# Create shortened labels for readability
|
207 |
-
labels = []
|
208 |
-
for r, c in zip(rows, cols):
|
209 |
-
sent1_short = sentences1[r][:50] + "..." if len(sentences1[r]) > 50 else sentences1[r]
|
210 |
-
sent2_short = sentences2[c][:50] + "..." if len(sentences2[c]) > 50 else sentences2[c]
|
211 |
-
labels.append(f"Pair {r+1}-{c+1}")
|
212 |
-
|
213 |
-
colors = ['#ff6666' if score >= 0.9 else '#ffcc66' if score >= 0.7 else '#66b3ff' for score in top_scores]
|
214 |
-
bars = plt.barh(range(len(top_scores)), top_scores, color=colors, edgecolor='black')
|
215 |
-
|
216 |
-
# Add value labels
|
217 |
-
for i, (score, bar) in enumerate(zip(top_scores, bars)):
|
218 |
-
plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
|
219 |
-
f'{score:.2%}', ha='left', va='center', fontsize=11, fontweight='bold')
|
220 |
-
|
221 |
-
plt.yticks(range(len(top_scores)), labels, fontsize=11)
|
222 |
-
plt.xlabel('Similarity Score', fontsize=14)
|
223 |
-
plt.title('Top 8 Most Similar Sentence Pairs', fontsize=16)
|
224 |
-
plt.xlim(0, 1.1)
|
225 |
-
plt.grid(True, alpha=0.3, axis='x')
|
226 |
-
|
227 |
-
# Add legend for colors
|
228 |
-
from matplotlib.patches import Patch
|
229 |
-
legend_elements = [
|
230 |
-
Patch(facecolor='#ff6666', label='Very Similar (≥90%)'),
|
231 |
-
Patch(facecolor='#ffcc66', label='Similar (70-89%)'),
|
232 |
-
Patch(facecolor='#66b3ff', label='Somewhat Similar (30-69%)')
|
233 |
-
]
|
234 |
-
plt.legend(handles=legend_elements, loc='lower right')
|
235 |
-
|
236 |
-
buf = BytesIO()
|
237 |
-
plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
238 |
-
plt.close()
|
239 |
-
buf.seek(0)
|
240 |
-
top_pairs_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
|
241 |
-
visualizations.append(top_pairs_img)
|
242 |
-
else:
|
243 |
-
visualizations.append(None)
|
244 |
|
245 |
-
return
|
246 |
|
247 |
def create_similarity_summary(overall_similarity, similar_pairs):
|
248 |
"""Create a text summary of the similarity analysis"""
|
@@ -254,25 +168,25 @@ def create_similarity_summary(overall_similarity, similar_pairs):
|
|
254 |
|
255 |
# Group by similarity ranges
|
256 |
high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
|
257 |
-
med_sim = len([p for p in similar_pairs if 0.
|
258 |
-
low_sim = len([p for p in similar_pairs if 0.
|
259 |
|
260 |
summary += "**Similarity Breakdown:**\n"
|
261 |
-
summary += f"- π΄ Very
|
262 |
-
summary += f"- π‘
|
263 |
-
summary += f"-
|
264 |
|
265 |
# Most common concepts
|
266 |
concepts = {
|
267 |
-
'
|
268 |
-
'
|
269 |
-
'
|
270 |
-
'
|
271 |
-
'
|
272 |
}
|
273 |
|
274 |
concept_counts = {concept: 0 for concept in concepts.keys()}
|
275 |
-
concept_counts['
|
276 |
|
277 |
for sent1, sent2, score in similar_pairs:
|
278 |
matched = False
|
@@ -283,14 +197,14 @@ def create_similarity_summary(overall_similarity, similar_pairs):
|
|
283 |
matched = True
|
284 |
break
|
285 |
if not matched:
|
286 |
-
concept_counts['
|
287 |
|
288 |
summary += "**Similar Content by Category:**\n"
|
289 |
for concept, count in concept_counts.items():
|
290 |
if count > 0:
|
291 |
-
summary += f"- {concept
|
292 |
else:
|
293 |
-
summary += "No significant similarities found above the
|
294 |
|
295 |
return summary
|
296 |
|
@@ -299,15 +213,15 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
|
|
299 |
concept_groups = defaultdict(list)
|
300 |
|
301 |
concepts = {
|
302 |
-
'
|
303 |
-
'
|
304 |
-
'
|
305 |
-
'
|
306 |
-
'
|
307 |
}
|
308 |
|
309 |
for sent1, sent2, score in similar_pairs:
|
310 |
-
matched_concept = '
|
311 |
for concept, keywords in concepts.items():
|
312 |
if any(keyword in sent1.lower() for keyword in keywords) or \
|
313 |
any(keyword in sent2.lower() for keyword in keywords):
|
@@ -319,7 +233,7 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
|
|
319 |
|
320 |
def similarity(file1, file2):
|
321 |
if file1 is None or file2 is None:
|
322 |
-
return "Please upload both documents.", None, None
|
323 |
|
324 |
try:
|
325 |
if file1.name.endswith('.pdf'):
|
@@ -327,16 +241,16 @@ def similarity(file1, file2):
|
|
327 |
elif file1.name.endswith('.docx'):
|
328 |
text1 = extract_text_from_docx(file1.name)
|
329 |
else:
|
330 |
-
return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None
|
331 |
|
332 |
if file2.name.endswith('.pdf'):
|
333 |
text2 = extract_text_from_pdf(file2.name)
|
334 |
elif file2.name.endswith('.docx'):
|
335 |
text2 = extract_text_from_docx(file2.name)
|
336 |
else:
|
337 |
-
return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
|
338 |
except Exception as e:
|
339 |
-
return f"Error processing files: {str(e)}", None, None
|
340 |
|
341 |
if not text1 or not text2 or "Error" in text1 or "Error" in text2:
|
342 |
error_msg = ""
|
@@ -344,9 +258,9 @@ def similarity(file1, file2):
|
|
344 |
error_msg += f"Document 1: {text1} "
|
345 |
if "Error" in text2:
|
346 |
error_msg += f"Document 2: {text2}"
|
347 |
-
return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
|
348 |
|
349 |
-
overall_similarity, similar_pairs
|
350 |
|
351 |
concept_groups = group_similar_concepts(similar_pairs)
|
352 |
|
@@ -358,9 +272,9 @@ def similarity(file1, file2):
|
|
358 |
|
359 |
for concept, pairs in concept_groups.items():
|
360 |
if pairs:
|
361 |
-
output_html += f"<h5
|
362 |
for i, (sent1, sent2, score) in enumerate(pairs):
|
363 |
-
color = "#ff6666" if score >= 0.9 else "#
|
364 |
output_html += f"""
|
365 |
<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
|
366 |
<p><b>π Document 1:</b> {sent1}</p>
|
@@ -370,26 +284,19 @@ def similarity(file1, file2):
|
|
370 |
"""
|
371 |
else:
|
372 |
output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
|
373 |
-
output_html += "<p>β οΈ No significant similarities found above the threshold (
|
374 |
output_html += "</div>"
|
375 |
|
376 |
-
# Generate
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
visualizations = [None, None, None]
|
381 |
-
summary_text = ""
|
382 |
|
383 |
-
|
384 |
-
visualizations = create_similarity_visualizations(sentences1, sentences2, similarity_matrix)
|
385 |
-
summary_text = create_similarity_summary(overall_similarity, similar_pairs)
|
386 |
-
|
387 |
-
return output_html, summary_text, visualizations[0], visualizations[1], visualizations[2]
|
388 |
|
389 |
# Create a clean Gradio interface
|
390 |
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
|
391 |
gr.Markdown("""
|
392 |
-
# π Document Similarity Checker
|
393 |
Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
|
394 |
""")
|
395 |
|
@@ -405,58 +312,35 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
|
|
405 |
summary_output = gr.Markdown()
|
406 |
output_html = gr.HTML(label="Detailed Similarities")
|
407 |
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
""")
|
417 |
-
heatmap_display = gr.HTML()
|
418 |
-
with gr.Column():
|
419 |
-
gr.Markdown("""
|
420 |
-
### π Similarity Distribution
|
421 |
-
Shows how many sentence pairs fall into each similarity range.
|
422 |
-
The red line indicates the 70% similarity threshold.
|
423 |
-
""")
|
424 |
-
dist_display = gr.HTML()
|
425 |
-
|
426 |
-
with gr.Row():
|
427 |
-
gr.Markdown("""
|
428 |
-
### π Top Similar Pairs
|
429 |
-
The most similar sentences between your documents, with similarity scores.
|
430 |
-
""")
|
431 |
-
top_pairs_display = gr.HTML()
|
432 |
|
433 |
# Define the processing function
|
434 |
def process_files(file1, file2):
|
435 |
-
result_html, summary_text,
|
436 |
|
437 |
-
|
438 |
-
if
|
439 |
-
|
440 |
|
441 |
-
|
442 |
-
if dist_img:
|
443 |
-
dist_html = f'<img src="{dist_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
|
444 |
-
|
445 |
-
top_pairs_html = "<p>No visualization available</p>"
|
446 |
-
if top_pairs_img:
|
447 |
-
top_pairs_html = f'<img src="{top_pairs_img}" alt="Top Similar Pairs" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
|
448 |
-
|
449 |
-
return result_html, summary_text, heatmap_html, dist_html, top_pairs_html
|
450 |
|
451 |
# Connect the button
|
452 |
submit_btn.click(
|
453 |
fn=process_files,
|
454 |
inputs=[file1, file2],
|
455 |
-
outputs=[output_html, summary_output,
|
456 |
)
|
457 |
|
458 |
# Launch the application
|
459 |
if __name__ == "__main__":
|
460 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
461 |
|
462 |
|
|
|
9 |
from collections import defaultdict
|
10 |
import base64
|
11 |
from io import BytesIO
|
|
|
|
|
12 |
|
13 |
# Try to import PyMuPDF with proper error handling
|
14 |
pymupdf_available = False
|
|
|
73 |
sentences2 = preprocess_text(doc2)
|
74 |
|
75 |
if not sentences1 or not sentences2:
|
76 |
+
return 0.0, []
|
77 |
|
78 |
# Get embeddings for all sentences
|
79 |
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
|
|
|
105 |
mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
|
106 |
overall_similarity = mean_similarity
|
107 |
|
108 |
+
return overall_similarity, similar_pairs
|
109 |
|
110 |
+
def create_similarity_barchart(similar_pairs):
    """Render the similarity-score distribution as a base64-encoded PNG bar chart.

    Args:
        similar_pairs: Sequence of tuples whose third element (index 2) is
            the similarity score of a sentence pair. Only that element is
            read here.

    Returns:
        A ``data:image/png;base64,...`` URI string containing the chart, or
        ``None`` when ``similar_pairs`` is empty or ``None``.
    """
    if not similar_pairs:
        return None

    # Bin edges and the label/colour shown for each of the three resulting bins.
    bins = [0.7, 0.8, 0.9, 1.0]
    bin_labels = ['Good (70-79%)', 'Strong (80-89%)', 'Very Strong (90-100%)']
    colors = ['#ffcc66', '#ffaa44', '#ff6666']

    # np.histogram drops values above the last edge. A score that rounding
    # pushed a hair past 1.0 would silently vanish from the chart, so cap
    # scores at 1.0 before counting. Scores below 0.7 are still excluded.
    scores = np.minimum(
        np.asarray([pair[2] for pair in similar_pairs], dtype=float), 1.0
    )
    counts, _ = np.histogram(scores, bins=bins)

    # Work on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", and always close it so a failure while drawing or
    # saving does not leave the figure registered in pyplot.
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        bars = ax.bar(range(len(counts)), counts, color=colors,
                      edgecolor='black', width=0.6)

        # Print each count just above its bar.
        for count, bar in zip(counts, bars):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                    str(count), ha='center', va='bottom',
                    fontsize=14, fontweight='bold')

        ax.set_xlabel('Similarity Level', fontsize=14, fontweight='bold')
        ax.set_ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
        ax.set_title('Document Similarity Distribution', fontsize=16,
                     fontweight='bold', pad=20)
        ax.set_xticks(range(len(bin_labels)))
        ax.set_xticklabels(bin_labels, fontsize=12)

        # Remove top and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Horizontal grid lines only, for easier reading of bar heights.
        ax.grid(axis='y', alpha=0.3)

        # Caption placed at the bottom centre of the figure.
        fig.text(0.5, 0.01,
                 "This chart shows how many sentence pairs fall into each similarity range.\n"
                 "Higher bars indicate more content shared between documents at that similarity level.",
                 ha="center", fontsize=11, style='italic',
                 bbox={"facecolor": "#f0f0f0", "alpha": 0.7, "pad": 5})

        buf = BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    finally:
        plt.close(fig)

    # getvalue() returns the whole buffer regardless of the stream position.
    encoded = base64.b64encode(buf.getvalue()).decode('utf-8')
    return f"data:image/png;base64,{encoded}"
|
160 |
|
161 |
def create_similarity_summary(overall_similarity, similar_pairs):
|
162 |
"""Create a text summary of the similarity analysis"""
|
|
|
168 |
|
169 |
# Group by similarity ranges
|
170 |
high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
|
171 |
+
med_sim = len([p for p in similar_pairs if 0.8 <= p[2] < 0.9])
|
172 |
+
low_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.8])
|
173 |
|
174 |
summary += "**Similarity Breakdown:**\n"
|
175 |
+
summary += f"- 🔴 Very Strong Similarity (90-100%): {high_sim} pairs\n"
|
176 |
+
summary += f"- 🟡 Strong Similarity (80-89%): {med_sim} pairs\n"
|
177 |
+
summary += f"- π Good Similarity (70-79%): {low_sim} pairs\n\n"
|
178 |
|
179 |
# Most common concepts
|
180 |
concepts = {
|
181 |
+
'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
|
182 |
+
'Education': ['education', 'learn', 'course', 'degree', 'academic'],
|
183 |
+
'Experience': ['experience', 'work', 'job', 'intern', 'position'],
|
184 |
+
'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
|
185 |
+
'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
|
186 |
}
|
187 |
|
188 |
concept_counts = {concept: 0 for concept in concepts.keys()}
|
189 |
+
concept_counts['Other'] = 0
|
190 |
|
191 |
for sent1, sent2, score in similar_pairs:
|
192 |
matched = False
|
|
|
197 |
matched = True
|
198 |
break
|
199 |
if not matched:
|
200 |
+
concept_counts['Other'] += 1
|
201 |
|
202 |
summary += "**Similar Content by Category:**\n"
|
203 |
for concept, count in concept_counts.items():
|
204 |
if count > 0:
|
205 |
+
summary += f"- {concept}: {count} pairs\n"
|
206 |
else:
|
207 |
+
summary += "No significant similarities found above the 70% threshold.\n"
|
208 |
|
209 |
return summary
|
210 |
|
|
|
213 |
concept_groups = defaultdict(list)
|
214 |
|
215 |
concepts = {
|
216 |
+
'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
|
217 |
+
'Education': ['education', 'learn', 'course', 'degree', 'academic'],
|
218 |
+
'Experience': ['experience', 'work', 'job', 'intern', 'position'],
|
219 |
+
'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
|
220 |
+
'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
|
221 |
}
|
222 |
|
223 |
for sent1, sent2, score in similar_pairs:
|
224 |
+
matched_concept = 'Other'
|
225 |
for concept, keywords in concepts.items():
|
226 |
if any(keyword in sent1.lower() for keyword in keywords) or \
|
227 |
any(keyword in sent2.lower() for keyword in keywords):
|
|
|
233 |
|
234 |
def similarity(file1, file2):
|
235 |
if file1 is None or file2 is None:
|
236 |
+
return "Please upload both documents.", None, None
|
237 |
|
238 |
try:
|
239 |
if file1.name.endswith('.pdf'):
|
|
|
241 |
elif file1.name.endswith('.docx'):
|
242 |
text1 = extract_text_from_docx(file1.name)
|
243 |
else:
|
244 |
+
return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None
|
245 |
|
246 |
if file2.name.endswith('.pdf'):
|
247 |
text2 = extract_text_from_pdf(file2.name)
|
248 |
elif file2.name.endswith('.docx'):
|
249 |
text2 = extract_text_from_docx(file2.name)
|
250 |
else:
|
251 |
+
return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
|
252 |
except Exception as e:
|
253 |
+
return f"Error processing files: {str(e)}", None, None
|
254 |
|
255 |
if not text1 or not text2 or "Error" in text1 or "Error" in text2:
|
256 |
error_msg = ""
|
|
|
258 |
error_msg += f"Document 1: {text1} "
|
259 |
if "Error" in text2:
|
260 |
error_msg += f"Document 2: {text2}"
|
261 |
+
return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
|
262 |
|
263 |
+
overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
|
264 |
|
265 |
concept_groups = group_similar_concepts(similar_pairs)
|
266 |
|
|
|
272 |
|
273 |
for concept, pairs in concept_groups.items():
|
274 |
if pairs:
|
275 |
+
output_html += f"<h5>οΏ½οΏ½οΏ½ {concept}:</h5>"
|
276 |
for i, (sent1, sent2, score) in enumerate(pairs):
|
277 |
+
color = "#ff6666" if score >= 0.9 else "#ffaa44" if score >= 0.8 else "#ffcc66"
|
278 |
output_html += f"""
|
279 |
<div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
|
280 |
<p><b>π Document 1:</b> {sent1}</p>
|
|
|
284 |
"""
|
285 |
else:
|
286 |
output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
|
287 |
+
output_html += "<p>⚠️ No significant similarities found above the threshold (70%).</p>"
|
288 |
output_html += "</div>"
|
289 |
|
290 |
+
# Generate bar chart
|
291 |
+
barchart_image = create_similarity_barchart(similar_pairs)
|
292 |
+
summary_text = create_similarity_summary(overall_similarity, similar_pairs)
|
|
|
|
|
|
|
293 |
|
294 |
+
return output_html, summary_text, barchart_image
|
|
|
|
|
|
|
|
|
295 |
|
296 |
# Create a clean Gradio interface
|
297 |
with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
|
298 |
gr.Markdown("""
|
299 |
+
# π Document Similarity Checker
|
300 |
Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
|
301 |
""")
|
302 |
|
|
|
312 |
summary_output = gr.Markdown()
|
313 |
output_html = gr.HTML(label="Detailed Similarities")
|
314 |
|
315 |
+
gr.Markdown("""
|
316 |
+
### π Similarity Distribution
|
317 |
+
**Color Guide:**
|
318 |
+
- 🔴 Very Strong Similarity (90-100%)
|
319 |
+
- 🟡 Strong Similarity (80-89%)
|
320 |
+
- π Good Similarity (70-79%)
|
321 |
+
""")
|
322 |
+
barchart_display = gr.HTML()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
|
324 |
# Define the processing function
|
325 |
def process_files(file1, file2):
    """Compare the two uploads and prepare the three values shown in the UI.

    Delegates to ``similarity`` and then turns the chart it returns into
    HTML: an ``<img>`` tag when a chart value is present, otherwise a short
    placeholder paragraph. The first two results are passed through as-is.
    """
    html_report, summary_md, chart_uri = similarity(file1, file2)

    if chart_uri:
        chart_markup = f'<img src="{chart_uri}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
    else:
        # No chart was produced, so show the fallback message instead.
        chart_markup = "<p>No similarity data available for visualization</p>"

    return html_report, summary_md, chart_markup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
|
334 |
# Connect the button
|
335 |
submit_btn.click(
|
336 |
fn=process_files,
|
337 |
inputs=[file1, file2],
|
338 |
+
outputs=[output_html, summary_output, barchart_display]
|
339 |
)
|
340 |
|
341 |
# Launch the application
|
342 |
if __name__ == "__main__":
|
343 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
344 |
+
|
345 |
|
346 |
|