NaimaAqeel committed on
Commit
6982985
·
verified ·
1 Parent(s): bca98ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -61
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- import docx # python-docx for DOCX extraction
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
  import re
@@ -9,17 +9,17 @@ import numpy as np
9
  from collections import defaultdict
10
  import base64
11
  from io import BytesIO
 
 
12
 
13
  # Try to import PyMuPDF with proper error handling
14
  pymupdf_available = False
15
  try:
16
- # Try importing PyMuPDF directly (the correct package)
17
  import pymupdf
18
  pymupdf_available = True
19
  print("PyMuPDF imported successfully")
20
  except ImportError:
21
  try:
22
- # Try the older import style
23
  import fitz
24
  pymupdf_available = True
25
  print("fitz imported successfully")
@@ -34,7 +34,6 @@ def extract_text_from_pdf(pdf_path):
34
  return "PDF processing not available. Please install PyMuPDF."
35
 
36
  try:
37
- # Use the correct import based on what's available
38
  if 'pymupdf' in globals():
39
  doc = pymupdf.open(pdf_path)
40
  else:
@@ -107,38 +106,148 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
107
  mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
108
  overall_similarity = mean_similarity.item()
109
 
110
- return overall_similarity, similar_pairs
111
 
112
- def create_heatmap_image(sentences1, sentences2, similarity_matrix):
113
- """Create a heatmap visualization of sentence similarities and return as base64"""
114
  if len(sentences1) == 0 or len(sentences2) == 0:
115
- return None
116
-
117
- # Create figure
118
- plt.figure(figsize=(10, 8))
119
- plt.imshow(similarity_matrix, cmap='viridis', interpolation='nearest')
120
- plt.colorbar(label='Similarity Score')
121
- plt.xlabel('Document 2 Sentences')
122
- plt.ylabel('Document 1 Sentences')
123
- plt.title('Sentence Similarity Heatmap')
124
- plt.tight_layout()
125
-
126
- # Save to buffer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  buf = BytesIO()
128
  plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
129
  plt.close()
130
  buf.seek(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- # Convert to base64
133
- img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
134
- return f"data:image/png;base64,{img_base64}"
135
 
136
  def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
137
  """Group similar sentences by concept using keyword extraction"""
138
- # Simple keyword-based grouping
139
  concept_groups = defaultdict(list)
140
 
141
- # Define some common concepts for SOPs
142
  concepts = {
143
  'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
144
  'education': ['education', 'learn', 'course', 'degree', 'academic'],
@@ -160,76 +269,72 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
160
 
161
  def similarity(file1, file2):
162
  if file1 is None or file2 is None:
163
- return "Please upload both documents.", None
164
 
165
- # Extract text based on file type
166
  try:
167
  if file1.name.endswith('.pdf'):
168
  text1 = extract_text_from_pdf(file1.name)
169
  elif file1.name.endswith('.docx'):
170
  text1 = extract_text_from_docx(file1.name)
171
  else:
172
- return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None
173
 
174
  if file2.name.endswith('.pdf'):
175
  text2 = extract_text_from_pdf(file2.name)
176
  elif file2.name.endswith('.docx'):
177
  text2 = extract_text_from_docx(file2.name)
178
  else:
179
- return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None
180
  except Exception as e:
181
- return f"Error processing files: {str(e)}", None
182
 
183
- # Check if text extraction failed
184
  if not text1 or not text2 or "Error" in text1 or "Error" in text2:
185
  error_msg = ""
186
  if "Error" in text1:
187
  error_msg += f"Document 1: {text1} "
188
  if "Error" in text2:
189
  error_msg += f"Document 2: {text2}"
190
- return error_msg if error_msg else "Error extracting text from one or both documents.", None
191
 
192
- # Calculate similarity and get similar pairs
193
- overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
194
 
195
- # Group similar concepts
196
  concept_groups = group_similar_concepts(similar_pairs)
197
 
198
  # Prepare detailed output
199
- output_html = f"<h3>Overall Similarity Score: {overall_similarity:.2%}</h3>"
200
 
201
  if similar_pairs:
202
- output_html += "<h4>Similar Content Found:</h4>"
203
 
204
  for concept, pairs in concept_groups.items():
205
- if pairs: # Only show concepts with matches
206
- output_html += f"<h5>{concept.capitalize()}:</h5>"
207
  for i, (sent1, sent2, score) in enumerate(pairs):
 
208
  output_html += f"""
209
- <div style="background-color: #f0f8ff; padding: 10px; margin: 5px; border-radius: 5px; border-left: 4px solid #4CAF50;">
210
- <p><b>Document 1:</b> {sent1}</p>
211
- <p><b>Document 2:</b> {sent2}</p>
212
- <p><b>Similarity:</b> {score:.2%}</p>
213
  </div>
214
  """
215
  else:
216
- output_html += "<p>No significant similarities found above the threshold (70%).</p>"
 
 
217
 
218
- # Generate similarity heatmap if there are sentences
219
  sentences1 = preprocess_text(text1)
220
  sentences2 = preprocess_text(text2)
221
 
222
- heatmap_image = None
 
 
223
  if sentences1 and sentences2:
224
- # Get embeddings for visualization
225
- embeddings1 = model.encode(sentences1, convert_to_tensor=True)
226
- embeddings2 = model.encode(sentences2, convert_to_tensor=True)
227
- similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()
228
-
229
- # Generate heatmap as base64 image
230
- heatmap_image = create_heatmap_image(sentences1, sentences2, similarity_matrix)
231
 
232
- return output_html, heatmap_image
233
 
234
  # Create a clean Gradio interface
235
  with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
@@ -247,27 +352,47 @@ with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as d
247
 
248
  with gr.Column(scale=2):
249
  gr.Markdown("### Analysis Results")
250
- output_html = gr.HTML(label="Similarity Analysis")
251
- gr.Markdown("### Similarity Heatmap")
 
 
 
 
252
  heatmap_display = gr.HTML()
 
 
 
 
 
 
 
253
 
254
  # Define the processing function
255
  def process_files(file1, file2):
256
- result_html, heatmap_img = similarity(file1, file2)
257
 
258
- heatmap_html = ""
259
  if heatmap_img:
260
- heatmap_html = f'<img src="{heatmap_img}" alt="Similarity Heatmap" style="max-width: 100%; border: 1px solid #ddd; border-radius: 5px; padding: 5px;">'
 
 
 
 
 
 
 
 
261
 
262
- return result_html, heatmap_html
263
 
264
  # Connect the button
265
  submit_btn.click(
266
  fn=process_files,
267
  inputs=[file1, file2],
268
- outputs=[output_html, heatmap_display]
269
  )
270
 
271
  # Launch the application
272
  if __name__ == "__main__":
273
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
  import os
2
+ import docx
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
  import re
 
9
  from collections import defaultdict
10
  import base64
11
  from io import BytesIO
12
+ import pandas as pd
13
+ import seaborn as sns
14
 
15
  # Try to import PyMuPDF with proper error handling
16
  pymupdf_available = False
17
  try:
 
18
  import pymupdf
19
  pymupdf_available = True
20
  print("PyMuPDF imported successfully")
21
  except ImportError:
22
  try:
 
23
  import fitz
24
  pymupdf_available = True
25
  print("fitz imported successfully")
 
34
  return "PDF processing not available. Please install PyMuPDF."
35
 
36
  try:
 
37
  if 'pymupdf' in globals():
38
  doc = pymupdf.open(pdf_path)
39
  else:
 
106
  mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
107
  overall_similarity = mean_similarity.item()
108
 
109
+ return overall_similarity, similar_pairs, cosine_similarities.cpu().numpy()
110
 
def _figure_to_data_uri(fig):
    """Encode *fig* as a base64 PNG data URI, then close the figure."""
    buf = BytesIO()
    try:
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    finally:
        # Close this specific figure even when rendering fails, so figures
        # do not accumulate in pyplot across requests.
        plt.close(fig)
    encoded = base64.b64encode(buf.getvalue()).decode('utf-8')
    return f"data:image/png;base64,{encoded}"


def create_similarity_visualizations(sentences1, sentences2, similarity_matrix):
    """Create the three charts used by the similarity report.

    Args:
        sentences1: Sentences of document 1 (only their count is used here).
        sentences2: Sentences of document 2 (only their count is used here).
        similarity_matrix: NumPy array of pairwise scores. Rows are labelled
            as document 1 and columns as document 2 in the charts
            (assumed to match the sentence lists -- confirm against caller).

    Returns:
        ``(None, None, None)`` when either sentence list is empty; otherwise a
        list ``[heatmap, distribution, top_pairs]`` of PNG data-URI strings.
    """
    if len(sentences1) == 0 or len(sentences2) == 0:
        return None, None, None

    # Flatten once; the histogram and the top-pairs chart both need it.
    flat_scores = similarity_matrix.flatten()

    # 1. Heatmap. Cells below 0.5 are masked out so weak matches do not
    #    clutter the picture.
    heatmap_fig = plt.figure(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        mask=similarity_matrix < 0.5,
        cmap='RdYlBu_r',
        center=0.7,
        xticklabels=False,
        yticklabels=False,
        cbar_kws={'label': 'Similarity Score', 'shrink': 0.8},
    )
    plt.title('Document Similarity Heatmap\n(Brighter colors = Higher similarity)', fontsize=14, pad=20)
    plt.xlabel('Document 2 Sentences', fontsize=12)
    plt.ylabel('Document 1 Sentences', fontsize=12)
    heatmap_img = _figure_to_data_uri(heatmap_fig)

    # 2. Histogram of scores above 0.3, with the 70% threshold marked.
    dist_fig = plt.figure(figsize=(10, 6))
    plt.hist(flat_scores[flat_scores > 0.3], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    plt.axvline(x=0.7, color='red', linestyle='--', label='Similarity Threshold (70%)')
    plt.xlabel('Similarity Score')
    plt.ylabel('Frequency')
    plt.title('Distribution of Sentence Similarities')
    plt.legend()
    plt.grid(True, alpha=0.3)
    dist_img = _figure_to_data_uri(dist_fig)

    # 3. Horizontal bars for the highest-scoring sentence pairs. Both lists
    #    are non-empty here, so top_n is always at least 1.
    top_n = min(10, len(sentences1) * len(sentences2))
    flat_indices = np.argsort(flat_scores)[-top_n:]
    top_scores = flat_scores[flat_indices]
    rows, cols = np.unravel_index(flat_indices, similarity_matrix.shape)
    labels = [f"Sent {r+1} ↔ Sent {c+1}" for r, c in zip(rows, cols)]

    top_fig = plt.figure(figsize=(12, 8))
    positions = range(len(top_scores))
    plt.barh(positions, top_scores, color='lightcoral')
    plt.yticks(positions, labels)
    plt.xlabel('Similarity Score')
    # Report the real number of bars; short documents have fewer than 10 pairs.
    plt.title(f'Top {len(top_scores)} Most Similar Sentence Pairs')
    plt.grid(True, alpha=0.3, axis='x')
    top_pairs_img = _figure_to_data_uri(top_fig)

    return [heatmap_img, dist_img, top_pairs_img]
198
+
def create_similarity_summary(overall_similarity, similar_pairs):
    """Build a Markdown summary of the similarity analysis.

    Args:
        overall_similarity: Overall score as a fraction (rendered as a percent).
        similar_pairs: Iterable of ``(sentence1, sentence2, score)`` tuples.

    Returns:
        A Markdown string with the overall score and, when pairs exist, the
        pair count, a high/medium breakdown and per-category counts.
    """
    parts = [
        "## 📊 Similarity Summary\n\n",
        f"**Overall Similarity Score:** {overall_similarity:.2%}\n\n",
    ]

    if not similar_pairs:
        parts.append("No significant similarities found above the 70% threshold.\n")
        return "".join(parts)

    parts.append(f"**Number of Similar Sentence Pairs:** {len(similar_pairs)}\n\n")

    # Bucket the pairs by score range.
    high_sim = sum(1 for pair in similar_pairs if pair[2] >= 0.9)
    med_sim = sum(1 for pair in similar_pairs if 0.7 <= pair[2] < 0.9)
    parts.append("**Similarity Breakdown:**\n")
    parts.append(f"- High Similarity (≥90%): {high_sim} pairs\n")
    parts.append(f"- Medium Similarity (70-89%): {med_sim} pairs\n\n")

    concepts = {
        'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'education': ['education', 'learn', 'course', 'degree', 'academic'],
        'experience': ['experience', 'work', 'job', 'intern', 'position'],
        'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability'],
    }

    concept_counts = dict.fromkeys(concepts, 0)
    concept_counts['other'] = 0

    for sent1, sent2, _score in similar_pairs:
        lowered = (sent1.lower(), sent2.lower())
        # Substring match on either sentence; the first category in dict
        # order that matches wins, and anything unmatched goes to 'other'.
        category = next(
            (
                concept
                for concept, keywords in concepts.items()
                if any(keyword in text for text in lowered for keyword in keywords)
            ),
            'other',
        )
        concept_counts[category] += 1

    parts.append("**Similar Content by Category:**\n")
    parts.extend(
        f"- {concept.capitalize()}: {count} pairs\n"
        for concept, count in concept_counts.items()
        if count > 0
    )

    return "".join(parts)
 
 
246
 
247
  def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
248
  """Group similar sentences by concept using keyword extraction"""
 
249
  concept_groups = defaultdict(list)
250
 
 
251
  concepts = {
252
  'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
253
  'education': ['education', 'learn', 'course', 'degree', 'academic'],
 
269
 
def _error_result(message):
    """Return the 5-tuple shape of ``similarity`` for a failure message."""
    return message, None, None, None, None


def _load_document_text(uploaded, label):
    """Extract text from one uploaded document.

    Returns ``(text, None)`` for a supported file, or ``(None, message)`` when
    the extension is neither PDF nor DOCX. *label* names the document in the
    message (e.g. ``"Document 1"``).
    """
    # Uploads normally expose the path as ``.name``; a plain path string is
    # accepted as well.
    path = getattr(uploaded, "name", uploaded)
    lowered = str(path).lower()  # extension check is case-insensitive
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(path), None
    if lowered.endswith('.docx'):
        return extract_text_from_docx(path), None
    return None, f"Unsupported file format for {label}. Please upload PDF or DOCX."


def similarity(file1, file2):
    """Compare two uploaded documents and build the report pieces.

    Args:
        file1: First uploaded document (PDF or DOCX), or None.
        file2: Second uploaded document (PDF or DOCX), or None.

    Returns:
        A 5-tuple ``(details_html, summary_markdown, heatmap, distribution,
        top_pairs)``. On any failure the first item is the error message and
        the remaining four are None.
    """
    import html  # local import keeps this block self-contained

    if file1 is None or file2 is None:
        return _error_result("Please upload both documents.")

    try:
        text1, problem = _load_document_text(file1, "Document 1")
        if problem:
            return _error_result(problem)
        text2, problem = _load_document_text(file2, "Document 2")
        if problem:
            return _error_result(problem)
    except Exception as e:
        return _error_result(f"Error processing files: {str(e)}")

    # NOTE(review): extractor failures are detected by the substring "Error",
    # so a document whose own text contains "Error" is rejected too. The
    # extractors' error format is not visible here -- confirm before tightening.
    failed1 = bool(text1) and "Error" in text1
    failed2 = bool(text2) and "Error" in text2
    if not text1 or not text2 or failed1 or failed2:
        error_msg = ""
        if failed1:
            error_msg += f"Document 1: {text1} "
        if failed2:
            error_msg += f"Document 2: {text2}"
        return _error_result(
            error_msg if error_msg else "Error extracting text from one or both documents."
        )

    overall_similarity, similar_pairs, similarity_matrix = calculate_cosine_similarity(text1, text2)

    concept_groups = group_similar_concepts(similar_pairs)

    # Prepare detailed output
    fragments = [
        f"<h3>Overall Similarity Score: <span style='color: #4CAF50;'>{overall_similarity:.2%}</span></h3>"
    ]

    if similar_pairs:
        fragments.append(f"<h4>Found {len(similar_pairs)} similar sentence pairs:</h4>")

        for concept, pairs in concept_groups.items():
            if not pairs:
                continue
            fragments.append(f"<h5>🔍 {concept.capitalize()}:</h5>")
            for sent1, sent2, score in pairs:
                color = "#4CAF50" if score >= 0.9 else "#FF9800" if score >= 0.7 else "#F44336"
                # Sentences come from user-uploaded files: escape them so
                # document content cannot inject markup into the page.
                fragments.append(f"""
                <div style="background-color: #f9f9f9; padding: 12px; margin: 8px; border-radius: 8px; border-left: 5px solid {color};">
                    <p><b>📄 Document 1:</b> {html.escape(sent1)}</p>
                    <p><b>📄 Document 2:</b> {html.escape(sent2)}</p>
                    <p><b>Similarity:</b> <span style='color: {color}; font-weight: bold;'>{score:.2%}</span></p>
                </div>
                """)
    else:
        fragments.append("<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; border-left: 5px solid #ffc107;'>")
        fragments.append("<p>⚠️ No significant similarities found above the threshold (70%).</p>")
        fragments.append("</div>")

    output_html = "".join(fragments)

    # Generate visualizations
    sentences1 = preprocess_text(text1)
    sentences2 = preprocess_text(text2)

    visualizations = [None, None, None]
    summary_text = ""

    if sentences1 and sentences2:
        visualizations = create_similarity_visualizations(sentences1, sentences2, similarity_matrix)
        summary_text = create_similarity_summary(overall_similarity, similar_pairs)

    return output_html, summary_text, visualizations[0], visualizations[1], visualizations[2]
338
 
339
  # Create a clean Gradio interface
340
  with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
 
352
 
353
  with gr.Column(scale=2):
354
  gr.Markdown("### Analysis Results")
355
+ summary_output = gr.Markdown()
356
+ output_html = gr.HTML(label="Detailed Similarities")
357
+
358
+ with gr.Row():
359
+ with gr.Column():
360
+ gr.Markdown("### πŸ“ˆ Similarity Heatmap")
361
  heatmap_display = gr.HTML()
362
+ with gr.Column():
363
+ gr.Markdown("### πŸ“Š Similarity Distribution")
364
+ dist_display = gr.HTML()
365
+
366
+ with gr.Row():
367
+ gr.Markdown("### πŸ” Top Similar Pairs")
368
+ top_pairs_display = gr.HTML()
369
 
370
  # Define the processing function
def process_files(file1, file2):
    """Run the similarity analysis and wrap each chart in HTML for display.

    Returns the detail HTML, the summary text, and one HTML snippet per chart
    (heatmap, distribution, top pairs). A chart that was not produced is
    replaced by a short placeholder paragraph.
    """
    def _chart_html(data_uri, alt_text):
        # Falsy image (None or empty) -> placeholder instead of an <img> tag.
        if not data_uri:
            return "<p>No visualization available</p>"
        return f'<img src="{data_uri}" alt="{alt_text}" style="max-width: 100%; border: 1px solid #ddd; border-radius: 8px; padding: 5px;">'

    result_html, summary_text, heatmap_img, dist_img, top_pairs_img = similarity(file1, file2)

    return (
        result_html,
        summary_text,
        _chart_html(heatmap_img, "Similarity Heatmap"),
        _chart_html(dist_img, "Similarity Distribution"),
        _chart_html(top_pairs_img, "Top Similar Pairs"),
    )
387
 
388
  # Connect the button
389
  submit_btn.click(
390
  fn=process_files,
391
  inputs=[file1, file2],
392
+ outputs=[output_html, summary_output, heatmap_display, dist_display, top_pairs_display]
393
  )
394
 
395
  # Launch the application
396
  if __name__ == "__main__":
397
+ demo.launch(server_name="0.0.0.0", server_port=7860)
398
+