NaimaAqeel committed on
Commit 9ddc9f6 · verified
1 Parent(s): a2ac660

Update app.py

Files changed (1)
  1. app.py +122 -52
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-import fitz  # PyMuPDF for PDF extraction
 import docx  # python-docx for DOCX extraction
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
@@ -8,20 +7,47 @@ from typing import List, Tuple, Dict
 import matplotlib.pyplot as plt
 import numpy as np
 from collections import defaultdict
+import base64
+from io import BytesIO
+
+# Try to import PyMuPDF with proper error handling
+pymupdf_available = False
+try:
+    # Try importing PyMuPDF directly (the correct package)
+    import pymupdf
+    pymupdf_available = True
+    print("PyMuPDF imported successfully")
+except ImportError:
+    try:
+        # Try the older import style
+        import fitz
+        pymupdf_available = True
+        print("fitz imported successfully")
+    except ImportError:
+        print("PyMuPDF/fitz is not available. PDF extraction will not work.")
 
 # Initialize the SentenceTransformer model
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
 def extract_text_from_pdf(pdf_path):
+    if not pymupdf_available:
+        return "PDF processing not available. Please install PyMuPDF."
+
     try:
-        doc = fitz.open(pdf_path)
+        # Use the correct import based on what's available
+        if 'pymupdf' in globals():
+            doc = pymupdf.open(pdf_path)
+        else:
+            import fitz
+            doc = fitz.open(pdf_path)
+
         text = ""
         for page in doc:
            text += page.get_text()
         return text
     except Exception as e:
         print(f"Error extracting text from PDF: {str(e)}")
-        return ""
+        return f"Error extracting PDF: {str(e)}"
 
 def extract_text_from_docx(docx_path):
     try:
@@ -30,10 +56,13 @@ def extract_text_from_docx(docx_path):
         return text
     except Exception as e:
         print(f"Error extracting text from DOCX: {str(e)}")
-        return ""
+        return f"Error extracting DOCX: {str(e)}"
 
 def preprocess_text(text: str) -> List[str]:
     """Split text into sentences and clean them"""
+    if not text or text.strip() == "":
+        return []
+
     # Split into sentences using regex
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
     # Clean sentences
@@ -46,6 +75,9 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
     sentences1 = preprocess_text(doc1)
     sentences2 = preprocess_text(doc2)
 
+    if not sentences1 or not sentences2:
+        return 0.0, []
+
     # Get embeddings for all sentences
     embeddings1 = model.encode(sentences1, convert_to_tensor=True)
     embeddings2 = model.encode(sentences2, convert_to_tensor=True)
@@ -70,33 +102,40 @@ def calculate_cosine_similarity(doc1: str, doc2: str) -> Tuple[float, List[Tuple
         similar_pairs.append((sentences1[i], sentences2[best_match_idx], max_similarity.item()))
 
     # Calculate overall similarity
-    if len(sentences1) > 0 and len(sentences2) > 0:
-        # Use max similarity for each sentence and average
-        max_similarities1 = cosine_similarities.max(dim=1)[0]
-        max_similarities2 = cosine_similarities.max(dim=0)[0]
-        mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
-        overall_similarity = mean_similarity.item()
-    else:
-        overall_similarity = 0.0
+    max_similarities1 = cosine_similarities.max(dim=1)[0]
+    max_similarities2 = cosine_similarities.max(dim=0)[0]
+    mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
+    overall_similarity = mean_similarity.item()
 
     return overall_similarity, similar_pairs
 
-def visualize_similarity(sentences1, sentences2, similarity_matrix):
-    """Create a heatmap visualization of sentence similarities"""
+def create_heatmap_image(sentences1, sentences2, similarity_matrix):
+    """Create a heatmap visualization of sentence similarities and return as base64"""
+    if len(sentences1) == 0 or len(sentences2) == 0:
+        return None
+
+    # Create figure
     plt.figure(figsize=(10, 8))
-    plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
+    plt.imshow(similarity_matrix, cmap='viridis', interpolation='nearest')
     plt.colorbar(label='Similarity Score')
     plt.xlabel('Document 2 Sentences')
     plt.ylabel('Document 1 Sentences')
     plt.title('Sentence Similarity Heatmap')
     plt.tight_layout()
-    plt.savefig('similarity_heatmap.png')
+
+    # Save to buffer
+    buf = BytesIO()
+    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
     plt.close()
-    return 'similarity_heatmap.png'
+    buf.seek(0)
+
+    # Convert to base64
+    img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
+    return f"data:image/png;base64,{img_base64}"
 
 def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
     """Group similar sentences by concept using keyword extraction"""
-    # Simple keyword-based grouping (could be enhanced with NLP techniques)
+    # Simple keyword-based grouping
    concept_groups = defaultdict(list)
 
     # Define some common concepts for SOPs
@@ -120,9 +159,35 @@ def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[
     return concept_groups
 
 def similarity(file1, file2):
+    if file1 is None or file2 is None:
+        return "Please upload both documents.", None
+
     # Extract text based on file type
-    text1 = extract_text_from_pdf(file1.name) if file1.name.endswith('.pdf') else extract_text_from_docx(file1.name)
-    text2 = extract_text_from_pdf(file2.name) if file2.name.endswith('.pdf') else extract_text_from_docx(file2.name)
+    try:
+        if file1.name.endswith('.pdf'):
+            text1 = extract_text_from_pdf(file1.name)
+        elif file1.name.endswith('.docx'):
+            text1 = extract_text_from_docx(file1.name)
+        else:
+            return "Unsupported file format for Document 1. Please upload PDF or DOCX.", None
+
+        if file2.name.endswith('.pdf'):
+            text2 = extract_text_from_pdf(file2.name)
+        elif file2.name.endswith('.docx'):
+            text2 = extract_text_from_docx(file2.name)
+        else:
+            return "Unsupported file format for Document 2. Please upload PDF or DOCX.", None
+    except Exception as e:
+        return f"Error processing files: {str(e)}", None
+
+    # Check if text extraction failed
+    if not text1 or not text2 or "Error" in text1 or "Error" in text2:
+        error_msg = ""
+        if "Error" in text1:
+            error_msg += f"Document 1: {text1} "
+        if "Error" in text2:
+            error_msg += f"Document 2: {text2}"
+        return error_msg if error_msg else "Error extracting text from one or both documents.", None
 
     # Calculate similarity and get similar pairs
     overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
@@ -141,63 +206,68 @@ def similarity(file1, file2):
            output_html += f"<h5>{concept.capitalize()}:</h5>"
            for i, (sent1, sent2, score) in enumerate(pairs):
                output_html += f"""
-                <div style="background-color: #f0f0f0; padding: 10px; margin: 5px; border-radius: 5px;">
+                <div style="background-color: #f0f8ff; padding: 10px; margin: 5px; border-radius: 5px; border-left: 4px solid #4CAF50;">
                    <p><b>Document 1:</b> {sent1}</p>
                    <p><b>Document 2:</b> {sent2}</p>
                    <p><b>Similarity:</b> {score:.2%}</p>
                </div>
                """
    else:
-        output_html += "<p>No significant similarities found above the threshold.</p>"
+        output_html += "<p>No significant similarities found above the threshold (70%).</p>"
 
     # Generate similarity heatmap if there are sentences
     sentences1 = preprocess_text(text1)
     sentences2 = preprocess_text(text2)
 
+    heatmap_image = None
     if sentences1 and sentences2:
         # Get embeddings for visualization
         embeddings1 = model.encode(sentences1, convert_to_tensor=True)
         embeddings2 = model.encode(sentences2, convert_to_tensor=True)
         similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()
 
-        # Generate and save heatmap
-        heatmap_path = visualize_similarity(sentences1, sentences2, similarity_matrix)
-        output_html += f'<h4>Similarity Heatmap:</h4><img src="/file={heatmap_path}" alt="Similarity Heatmap" style="max-width: 100%;">'
+        # Generate heatmap as base64 image
+        heatmap_image = create_heatmap_image(sentences1, sentences2, similarity_matrix)
 
-    return output_html
+    return output_html, heatmap_image
 
-# Create a Gradio interface with enhanced features
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
+# Create a clean Gradio interface
+with gr.Blocks(title="Document Similarity Checker", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # Document Similarity Checker with Detailed Analysis
-    Upload two documents to compare their content and identify specific similarities.
+    # 📄 Document Similarity Checker with Detailed Analysis
+    Upload two documents (PDF or DOCX) to compare their content and identify specific similarities.
     """)
 
     with gr.Row():
-        with gr.Column():
-            file1 = gr.File(label="Upload Document 1", file_types=[".pdf", ".docx"])
-            file2 = gr.File(label="Upload Document 2", file_types=[".pdf", ".docx"])
-            submit = gr.Button("Compare Documents", variant="primary")
+        with gr.Column(scale=1):
+            gr.Markdown("### Upload Documents")
+            file1 = gr.File(label="Document 1", file_types=[".pdf", ".docx"])
+            file2 = gr.File(label="Document 2", file_types=[".pdf", ".docx"])
+            submit_btn = gr.Button("Compare Documents", variant="primary")
+
+        with gr.Column(scale=2):
+            gr.Markdown("### Analysis Results")
+            output_html = gr.HTML(label="Similarity Analysis")
+            gr.Markdown("### Similarity Heatmap")
+            heatmap_display = gr.HTML()
+
+    # Define the processing function
+    def process_files(file1, file2):
+        result_html, heatmap_img = similarity(file1, file2)
+
+        heatmap_html = ""
+        if heatmap_img:
+            heatmap_html = f'<img src="{heatmap_img}" alt="Similarity Heatmap" style="max-width: 100%; border: 1px solid #ddd; border-radius: 5px; padding: 5px;">'
 
-        with gr.Column():
-            output = gr.HTML(label="Similarity Analysis Results")
-
-    # Add examples for users to try
-    gr.Examples(
-        examples=[
-            [os.path.join(os.path.dirname(__file__), "sample1.pdf"), os.path.join(os.path.dirname(__file__), "sample2.pdf")],
-            [os.path.join(os.path.dirname(__file__), "sample1.docx"), os.path.join(os.path.dirname(__file__), "sample2.docx")]
-        ],
+        return result_html, heatmap_html
+
+    # Connect the button
+    submit_btn.click(
+        fn=process_files,
        inputs=[file1, file2],
-        outputs=output,
-        fn=similarity,
-        cache_examples=False
+        outputs=[output_html, heatmap_display]
    )
-
-    submit.click(fn=similarity, inputs=[file1, file2], outputs=output)
-
-# Use the GRADIO_SERVER_PORT environment variable, default to 7860 if not set
-port = int(os.getenv('GRADIO_SERVER_PORT', 7860))
 
+# Launch the application
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=port)
+    demo.launch(server_name="0.0.0.0", server_port=7860)
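For orientation, here is a minimal, hypothetical smoke test (not part of the commit) for the updated similarity() function; it assumes the app.py from this commit is importable and that the two sample document paths below exist locally. Because the function now returns an HTML report plus a base64 data-URI heatmap (or None when no heatmap could be built), and only reads the .name attribute of its arguments, a lightweight stand-in for Gradio's uploaded-file object is enough:

# Hypothetical smoke test; file paths below are assumptions, not files in the repo.
from types import SimpleNamespace

from app import similarity  # importing app.py builds the Blocks UI but does not launch it

doc_a = SimpleNamespace(name="sample_sop_a.docx")  # assumed local file
doc_b = SimpleNamespace(name="sample_sop_b.docx")  # assumed local file

report_html, heatmap_data_uri = similarity(doc_a, doc_b)
print(report_html[:300])                                   # beginning of the HTML report (or an error message)
print("heatmap generated:", heatmap_data_uri is not None)  # False when extraction failed or no sentences matched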