NaimaAqeel committed on
Commit 092f11f Β· verified Β· 1 Parent(s): 714e663

Update app.py

Files changed (1)
  1. app.py +83 -199
app.py CHANGED
@@ -9,8 +9,6 @@ import numpy as np
  from collections import defaultdict
  import base64
  from io import BytesIO
- import pandas as pd
- import seaborn as sns
 
  # Try to import PyMuPDF with proper error handling
  pymupdf_available = False
@@ -75,7 +73,7 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
  sentences2 = preprocess_text(doc2)
 
  if not sentences1 or not sentences2:
- return 0.0, [], np.array([])
+ return 0.0, []
 
  # Get embeddings for all sentences
  embeddings1 = model.encode(sentences1, convert_to_tensor=True)
@@ -107,142 +105,58 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
  mean_similarity = (np.mean(max_similarities1) + np.mean(max_similarities2)) / 2.0
  overall_similarity = mean_similarity
 
- return overall_similarity, similar_pairs, similarity_matrix
+ return overall_similarity, similar_pairs
 
- def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
- """Create multiple visualizations for similarity analysis"""
- if len(sentences1) == 0 or len(sentences2) == 0:
- return None, None, None
-
- visualizations = []
-
- # 1. Improved Heatmap with clear explanation
- plt.figure(figsize=(14, 10))
-
- # Create a mask for values below threshold to make the heatmap clearer
- mask = similarity_matrix < 0.3
-
- # Use a clear color palette
- ax = sns.heatmap(similarity_matrix,
- mask=mask,
- cmap='YlOrRd',
- vmin=0.3,
- vmax=1.0,
- xticklabels=False,
- yticklabels=False,
- cbar_kws={'label': 'Similarity Score', 'shrink': 0.8})
-
- plt.title('Document Similarity Heatmap\n\nπŸ”΄ Red = Very Similar 🟑 Yellow = Somewhat Similar βšͺ White = Not Similar',
- fontsize=16, pad=20)
- plt.xlabel('Document 2 Sentences', fontsize=14)
- plt.ylabel('Document 1 Sentences', fontsize=14)
-
- # Add explanation text
- explanation_text = (
- "This heatmap shows how similar each sentence in Document 1 is to each sentence in Document 2.\n"
- "Bright red areas indicate very similar content, yellow areas show some similarity, \n"
- "and white areas indicate little to no similarity."
- )
- plt.figtext(0.5, 0.01, explanation_text, ha="center", fontsize=12, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
+ def create_similarity_barchart(similar_pairs):
+ """Create a bar chart showing similarity distribution"""
+ if not similar_pairs:
+ return None
 
- buf = BytesIO()
- plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
- plt.close()
- buf.seek(0)
- heatmap_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
- visualizations.append(heatmap_img)
-
- # 2. Similarity Distribution Chart
  plt.figure(figsize=(12, 8))
 
- # Flatten the similarity matrix and filter out low similarities
- flat_similarities = similarity_matrix.flatten()
- flat_similarities = flat_similarities[flat_similarities > 0.3] # Only show meaningful similarities
+ # Extract similarity scores
+ scores = [pair[2] for pair in similar_pairs]
 
  # Create bins with labels
- bins = [0.3, 0.5, 0.7, 0.9, 1.0]
- bin_labels = ['Low (30-50%)', 'Medium (50-70%)', 'High (70-90%)', 'Very High (90-100%)']
+ bins = [0.7, 0.8, 0.9, 1.0]
+ bin_labels = ['Good (70-79%)', 'Strong (80-89%)', 'Very Strong (90-100%)']
 
- # Create histogram
- counts, bin_edges = np.histogram(flat_similarities, bins=bins)
+ # Count pairs in each bin
+ counts, _ = np.histogram(scores, bins=bins)
 
  # Create bar chart with colors
- colors = ['#ff9999', '#ffcc99', '#c2e699', '#66b3ff']
- bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black')
+ colors = ['#ffcc66', '#ffaa44', '#ff6666']
+ bars = plt.bar(range(len(counts)), counts, color=colors, edgecolor='black', width=0.6)
 
  # Add value labels on bars
  for i, (count, bar) in enumerate(zip(counts, bars)):
- plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
- str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
+ plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
+ str(count), ha='center', va='bottom', fontsize=14, fontweight='bold')
+
+ plt.xlabel('Similarity Level', fontsize=14, fontweight='bold')
+ plt.ylabel('Number of Sentence Pairs', fontsize=14, fontweight='bold')
+ plt.title('Document Similarity Distribution', fontsize=16, fontweight='bold', pad=20)
+ plt.xticks(range(len(bin_labels)), bin_labels, fontsize=12)
 
- plt.axvline(x=1.5, color='red', linestyle='--', linewidth=2, label='Similarity Threshold (70%)')
- plt.xlabel('Similarity Level', fontsize=14)
- plt.ylabel('Number of Sentence Pairs', fontsize=14)
- plt.title('Distribution of Sentence Similarities', fontsize=16)
- plt.xticks(range(len(bin_labels)), bin_labels, rotation=45, ha='right')
- plt.legend(fontsize=12)
- plt.grid(True, alpha=0.3)
+ # Remove top and right spines
+ plt.gca().spines['top'].set_visible(False)
+ plt.gca().spines['right'].set_visible(False)
+
+ # Add grid for better readability
+ plt.grid(axis='y', alpha=0.3)
+
+ # Add explanation
+ plt.figtext(0.5, 0.01,
+ "This chart shows how many sentence pairs fall into each similarity range.\n"
+ "Higher bars indicate more content shared between documents at that similarity level.",
+ ha="center", fontsize=11, style='italic', bbox={"facecolor":"#f0f0f0", "alpha":0.7, "pad":5})
 
  buf = BytesIO()
  plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
  plt.close()
  buf.seek(0)
- dist_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
- visualizations.append(dist_img)
-
- # 3. Top Similarity Pairs Bar Chart
- plt.figure(figsize=(14, 10))
-
- # Get top similarity scores and their positions
- top_n = min(8, len(sentences1) * len(sentences2))
- if top_n > 0:
- # Flatten and get indices of top values
- flat_indices = np.argsort(similarity_matrix.flatten())[-top_n:]
- top_scores = similarity_matrix.flatten()[flat_indices]
-
- # Convert flat indices to 2D indices
- rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
-
- # Create shortened labels for readability
- labels = []
- for r, c in zip(rows, cols):
- sent1_short = sentences1[r][:50] + "..." if len(sentences1[r]) > 50 else sentences1[r]
- sent2_short = sentences2[c][:50] + "..." if len(sentences2[c]) > 50 else sentences2[c]
- labels.append(f"Pair {r+1}-{c+1}")
-
- colors = ['#ff6666' if score >= 0.9 else '#ffcc66' if score >= 0.7 else '#66b3ff' for score in top_scores]
- bars = plt.barh(range(len(top_scores)), top_scores, color=colors, edgecolor='black')
-
- # Add value labels
- for i, (score, bar) in enumerate(zip(top_scores, bars)):
- plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
- f'{score:.2%}', ha='left', va='center', fontsize=11, fontweight='bold')
-
- plt.yticks(range(len(top_scores)), labels, fontsize=11)
- plt.xlabel('Similarity Score', fontsize=14)
- plt.title('Top 8 Most Similar Sentence Pairs', fontsize=16)
- plt.xlim(0, 1.1)
- plt.grid(True, alpha=0.3, axis='x')
-
- # Add legend for colors
- from matplotlib.patches import Patch
- legend_elements = [
- Patch(facecolor='#ff6666', label='Very Similar (β‰₯90%)'),
- Patch(facecolor='#ffcc66', label='Similar (70-89%)'),
- Patch(facecolor='#66b3ff', label='Somewhat Similar (30-69%)')
- ]
- plt.legend(handles=legend_elements, loc='lower right')
-
- buf = BytesIO()
- plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
- plt.close()
- buf.seek(0)
- top_pairs_img = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
- visualizations.append(top_pairs_img)
- else:
- visualizations.append(None)
 
- return visualizations
+ return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
 
  def create_similarity_summary(overall_similarity, similar_pairs):
  """Create a text summary of the similarity analysis"""
@@ -254,25 +168,25 @@ def create_similarity_summary(overall_similarity, similar_pairs):
 
  # Group by similarity ranges
  high_sim = len([p for p in similar_pairs if p[2] >= 0.9])
- med_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.9])
- low_sim = len([p for p in similar_pairs if 0.3 <= p[2] < 0.7])
+ med_sim = len([p for p in similar_pairs if 0.8 <= p[2] < 0.9])
+ low_sim = len([p for p in similar_pairs if 0.7 <= p[2] < 0.8])
 
  summary += "**Similarity Breakdown:**\n"
- summary += f"- πŸ”΄ Very High Similarity (β‰₯90%): {high_sim} pairs\n"
- summary += f"- 🟑 High Similarity (70-89%): {med_sim} pairs\n"
- summary += f"- πŸ”΅ Some Similarity (30-69%): {low_sim} pairs\n\n"
+ summary += f"- πŸ”΄ Very Strong Similarity (90-100%): {high_sim} pairs\n"
+ summary += f"- 🟑 Strong Similarity (80-89%): {med_sim} pairs\n"
+ summary += f"- 🟠 Good Similarity (70-79%): {low_sim} pairs\n\n"
 
  # Most common concepts
  concepts = {
- 'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
- 'education': ['education', 'learn', 'course', 'degree', 'academic'],
- 'experience': ['experience', 'work', 'job', 'intern', 'position'],
- 'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
- 'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
+ 'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
+ 'Education': ['education', 'learn', 'course', 'degree', 'academic'],
+ 'Experience': ['experience', 'work', 'job', 'intern', 'position'],
+ 'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
+ 'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
  }
 
  concept_counts = {concept: 0 for concept in concepts.keys()}
- concept_counts['other'] = 0
+ concept_counts['Other'] = 0
 
  for sent1, sent2, score in similar_pairs:
  matched = False
@@ -283,14 +197,14 @@ def create_similarity_summary(overall_similarity, similar_pairs):
  matched = True
  break
  if not matched:
- concept_counts['other'] += 1
+ concept_counts['Other'] += 1
 
  summary += "**Similar Content by Category:**\n"
  for concept, count in concept_counts.items():
  if count > 0:
- summary += f"- {concept.capitalize()}: {count} pairs\n"
+ summary += f"- {concept}: {count} pairs\n"
  else:
- summary += "No significant similarities found above the 30% threshold.\n"
+ summary += "No significant similarities found above the 70% threshold.\n"
 
  return summary
 
@@ -299,15 +213,15 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
  concept_groups = defaultdict(list)
 
  concepts = {
- 'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
- 'education': ['education', 'learn', 'course', 'degree', 'academic'],
- 'experience': ['experience', 'work', 'job', 'intern', 'position'],
- 'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
- 'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
+ 'Research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
+ 'Education': ['education', 'learn', 'course', 'degree', 'academic'],
+ 'Experience': ['experience', 'work', 'job', 'intern', 'position'],
+ 'Goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
+ 'Skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
  }
 
  for sent1, sent2, score in similar_pairs:
- matched_concept = 'other'
+ matched_concept = 'Other'
  for concept, keywords in concepts.items():
  if any(keyword in sent1.lower() for keyword in keywords) or \
  any(keyword in sent2.lower() for keyword in keywords):
@@ -319,7 +233,7 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
 
  def similarity(file1, file2):
  if file1 is None or file2 is None:
- return "Please upload both documents.", None, None, None, None
+ return "Please upload both documents.", None, None
 
  try:
  if file1.name.endswith('.pdf'):
@@ -327,16 +241,16 @@ def similarity(file1, file2):
  elif file1.name.endswith('.docx'):
  text1 = extract_text_from_docx(file1.name)
  else:
- return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None, None, None
+ return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None, None
 
  if file2.name.endswith('.pdf'):
  text2 = extract_text_from_pdf(file2.name)
  elif file2.name.endswith('.docx'):
  text2 = extract_text_from_docx(file2.name)
  else:
- return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None, None, None
+ return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None, None
  except Exception as e:
- return f"Error processing files: {str(e)}", None, None, None, None
+ return f"Error processing files: {str(e)}", None, None
 
  if not text1 or not text2 or "Error" in text1 or "Error" in text2:
  error_msg = ""
@@ -344,9 +258,9 @@ def similarity(file1, file2):
  error_msg += f"Document 1: {text1} "
  if "Error" in text2:
  error_msg += f"Document 2: {text2}"
- return error_msg if error_msg else "Error extracting text from one or both documents.", None, None, None, None
+ return error_msg if error_msg else "Error extracting text from one or both documents.", None, None
 
- overall_similarity, similar_pairs, similarity_matrix = calculate_cosine_similarity(text1, text2)
+ overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
 
  concept_groups = group_similar_concepts(similar_pairs)
 
@@ -358,9 +272,9 @@ def similarity(file1, file2):
 
  for concept, pairs in concept_groups.items():
  if pairs:
- output_html += f"<h5>πŸ” {concept.capitalize()}:</h5>"
+ output_html += f"<h5>πŸ” {concept}:</h5>"
  for i, (sent1, sent2, score) in enumerate(pairs):
- color = "#ff6666" if score >= 0.9 else "#ffcc66" if score >= 0.7 else "#66b3ff"
+ color = "#ff6666" if score >= 0.9 else "#ffaa44" if score >= 0.8 else "#ffcc66"
  output_html += f"""
  <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
  <p><b>πŸ“„ Document 1:</b> {sent1}</p>
@@ -370,26 +284,19 @@ def similarity(file1, file2):
  """
  else:
  output_html += "<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>"
- output_html += "<p>⚠️ No significant similarities found above the threshold (30%).</p>"
+ output_html += "<p>⚠️ No significant similarities found above the threshold (70%).</p>"
  output_html += "</div>"
 
- # Generate visualizations
- sentences1 = preprocess_text(text1)
- sentences2 = preprocess_text(text2)
-
- visualizations = [None, None, None]
- summary_text = ""
+ # Generate bar chart
+ barchart_image = create_similarity_barchart(similar_pairs)
+ summary_text = create_similarity_summary(overall_similarity, similar_pairs)
 
- if sentences1 and sentences2:
- visualizations = create_similarity_visualizations(sentences1, sentences2, similarity_matrix)
- summary_text = create_similarity_summary(overall_similarity, similar_pairs)
-
- return output_html, summary_text, visualizations[0], visualizations[1], visualizations[2]
+ return output_html, summary_text, barchart_image
 
  # Create a clean Gradio interface
  with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
  gr.Markdown("""
- # πŸ“„ Document Similarity Checker with Detailed Analysis
+ # πŸ“„ Document Similarity Checker
  Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
  """)
 
@@ -405,58 +312,35 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
  summary_output = gr.Markdown()
  output_html = gr.HTML(label="Detailed Similarities")
 
- with gr.Row():
- with gr.Column():
- gr.Markdown("""
- ### πŸ“ˆ Similarity Heatmap
- **Color Guide:**
- - πŸ”΄ Red = Very Similar (90-100%)
- - 🟑 Yellow = Somewhat Similar (70-89%)
- - βšͺ White = Not Similar (0-69%)
- """)
- heatmap_display = gr.HTML()
- with gr.Column():
- gr.Markdown("""
- ### πŸ“Š Similarity Distribution
- Shows how many sentence pairs fall into each similarity range.
- The red line indicates the 70% similarity threshold.
- """)
- dist_display = gr.HTML()
-
- with gr.Row():
- gr.Markdown("""
- ### πŸ” Top Similar Pairs
- The most similar sentences between your documents, with similarity scores.
- """)
- top_pairs_display = gr.HTML()
+ gr.Markdown("""
+ ### πŸ“Š Similarity Distribution
+ **Color Guide:**
+ - πŸ”΄ Very Strong Similarity (90-100%)
+ - 🟑 Strong Similarity (80-89%)
+ - 🟠 Good Similarity (70-79%)
+ """)
+ barchart_display = gr.HTML()
 
  # Define the processing function
  def process_files(file1, file2):
- result_html, summary_text, heatmap_img, dist_img, top_pairs_img = similarity(file1, file2)
+ result_html, summary_text, barchart_img = similarity(file1, file2)
 
- heatmap_html = "<p>No visualization available</p>"
- if heatmap_img:
- heatmap_html = f'<img src="{heatmap_img}" alt="Similarity Heatmap" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
+ barchart_html = "<p>No similarity data available for visualization</p>"
+ if barchart_img:
+ barchart_html = f'<img src="{barchart_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
 
- dist_html = "<p>No visualization available</p>"
- if dist_img:
- dist_html = f'<img src="{dist_img}" alt="Similarity Distribution" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
-
- top_pairs_html = "<p>No visualization available</p>"
- if top_pairs_img:
- top_pairs_html = f'<img src="{top_pairs_img}" alt="Top Similar Pairs" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'
-
- return result_html, summary_text, heatmap_html, dist_html, top_pairs_html
+ return result_html, summary_text, barchart_html
 
  # Connect the button
  submit_btn.click(
  fn=process_files,
  inputs=[file1, file2],
- outputs=[output_html, summary_output, heatmap_display, dist_display, top_pairs_display]
+ outputs=[output_html, summary_output, barchart_display]
  )
 
  # Launch the application
  if __name__ == "__main__":
  demo.launch(server_name="0.0.0.0", server_port=7860)
+
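For reference, a minimal sketch of the binning step that the new `create_similarity_barchart` performs before plotting: the bins and labels come from the diff above, while the `similar_pairs` values below are invented purely for illustration.

```python
# Sketch of the similarity binning used by create_similarity_barchart.
# The similar_pairs tuples here are hypothetical example data, not app output.
import numpy as np

similar_pairs = [
    ("sentence a", "sentence b", 0.72),
    ("sentence c", "sentence d", 0.78),
    ("sentence e", "sentence f", 0.85),
    ("sentence g", "sentence h", 0.91),
    ("sentence i", "sentence j", 0.95),
]

scores = [pair[2] for pair in similar_pairs]
bins = [0.7, 0.8, 0.9, 1.0]
bin_labels = ['Good (70-79%)', 'Strong (80-89%)', 'Very Strong (90-100%)']

# np.histogram counts scores per bin: [0.7, 0.8), [0.8, 0.9), and [0.9, 1.0]
# (the final bin is inclusive of its right edge).
counts, _ = np.histogram(scores, bins=bins)

for label, count in zip(bin_labels, counts):
    print(f"{label}: {count} pairs")
# Good (70-79%): 2 pairs
# Strong (80-89%): 1 pairs
# Very Strong (90-100%): 2 pairs
```

These counts are what the bar heights in the new "Document Similarity Distribution" chart represent.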