NaimaAqeel committed on
Commit
2f1cc59
·
verified ·
1 Parent(s): fd3c2de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -76
app.py CHANGED
@@ -1,18 +1,17 @@
1
  import os
2
- import fitz # PyMuPDF
3
- import docx
4
  from sentence_transformers import SentenceTransformer, util
5
  import gradio as gr
6
- import torch
 
7
  import matplotlib.pyplot as plt
8
- import seaborn as sns
9
- from io import BytesIO
10
- import base64
11
 
12
# ----------------- Initialize model -----------------
# Load the sentence-embedding model once at import time; the functions in
# this file reuse this single instance through ``model.encode``.
model = SentenceTransformer('all-MiniLM-L6-v2')
14
 
15
- # ----------------- Text Extraction -----------------
16
  def extract_text_from_pdf(pdf_path):
17
  try:
18
  doc = fitz.open(pdf_path)
@@ -21,90 +20,184 @@ def extract_text_from_pdf(pdf_path):
21
  text += page.get_text()
22
  return text
23
  except Exception as e:
24
- return f"Error extracting PDF: {str(e)}"
 
25
 
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    The function never raises: on any failure it returns a string of the
    form ``"Error extracting DOCX: <reason>"`` in place of the document text.
    """
    try:
        document = docx.Document(docx_path)
        lines = []
        for paragraph in document.paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
    except Exception as exc:
        return f"Error extracting DOCX: {str(exc)}"
32
-
33
- # ----------------- Chunk Similarity -----------------
34
def chunk_text(text, chunk_size=5):
    """Split *text* on '.' and group the pieces into chunks.

    Each piece is stripped of surrounding whitespace and empty pieces are
    dropped. Consecutive runs of ``chunk_size`` pieces are then joined with
    ``'. '`` to form one chunk each; the last chunk may be shorter.
    """
    pieces = []
    for fragment in text.split('.'):
        trimmed = fragment.strip()
        if trimmed:
            pieces.append(trimmed)

    grouped = []
    for start in range(0, len(pieces), chunk_size):
        window = pieces[start:start + chunk_size]
        grouped.append('. '.join(window))
    return grouped
38
 
39
def get_similar_chunks(doc1, doc2, chunk_size=5, threshold=0.7):
    """Find, for every chunk of *doc1*, its best-matching chunk in *doc2*.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        chunk_size: Number of sentences per chunk, passed to ``chunk_text``.
        threshold: Minimum cosine similarity (inclusive) for a pair to be
            reported.

    Returns:
        A 4-tuple ``(similar_pairs, sim_matrix, chunks1, chunks2)`` where
        ``similar_pairs`` is a list of dicts with the keys ``doc1_chunk``,
        ``doc2_chunk`` and ``similarity`` (rounded to 3 decimals), and
        ``sim_matrix`` is the chunk-by-chunk cosine similarity matrix as a
        NumPy array with one row per chunk of *doc1*.
    """
    chunks1 = chunk_text(doc1, chunk_size)
    chunks2 = chunk_text(doc2, chunk_size)

    # With no chunks on either side there is nothing to encode or compare;
    # return an empty result instead of handing empty input to the model.
    if not chunks1 or not chunks2:
        empty_matrix = torch.zeros((len(chunks1), len(chunks2))).numpy()
        return [], empty_matrix, chunks1, chunks2

    embeddings1 = model.encode(chunks1, convert_to_tensor=True)
    embeddings2 = model.encode(chunks2, convert_to_tensor=True)
    cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)

    # One reduction over the matrix yields the best score and the index of
    # the best doc2 chunk for every doc1 chunk.
    best_scores, best_indices = cosine_similarities.max(dim=1)
    similar_pairs = [
        {
            "doc1_chunk": chunk,
            "doc2_chunk": chunks2[match_idx],
            "similarity": round(score, 3),
        }
        for chunk, score, match_idx in zip(
            chunks1, best_scores.tolist(), best_indices.tolist()
        )
        if score >= threshold
    ]
    return similar_pairs, cosine_similarities.cpu().numpy(), chunks1, chunks2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- # ----------------- Generate Heatmap -----------------
60
def create_heatmap(sim_matrix, chunks1, chunks2):
    """Render the similarity matrix as a heatmap and return it as a data URI.

    Args:
        sim_matrix: Similarity matrix to plot; rows are labelled from
            *chunks1* and columns from *chunks2*.
        chunks1: Chunks of document 1 (only their count is used, for the
            ``C1..Cn`` row labels).
        chunks2: Chunks of document 2 (only their count is used, for the
            ``C1..Cn`` column labels).

    Returns:
        A ``data:image/png;base64,...`` string containing the PNG image.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            sim_matrix,
            xticklabels=[f"C{i+1}" for i in range(len(chunks2))],
            yticklabels=[f"C{i+1}" for i in range(len(chunks1))],
            cmap="viridis",
            ax=ax,
        )
        ax.set_xlabel("Document 2 Chunks")
        ax.set_ylabel("Document 1 Chunks")
        ax.set_title("Similarity Heatmap")

        with BytesIO() as buf:
            fig.savefig(buf, format="png")
            encoded = base64.b64encode(buf.getvalue()).decode()
    finally:
        # Close this specific figure even when plotting or saving fails, so
        # figures do not accumulate across requests.
        plt.close(fig)
    return f"data:image/png;base64,{encoded}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- # ----------------- Main Function -----------------
77
def _load_uploaded_text(upload):
    """Return the text of one uploaded document, dispatching on its extension."""
    # Accept either a plain path string or an object exposing the path as
    # ``.name``.
    path = upload if isinstance(upload, str) else upload.name
    # Compare case-insensitively so that "REPORT.PDF" is still read as a PDF.
    if path.lower().endswith('.pdf'):
        return extract_text_from_pdf(path)
    return extract_text_from_docx(path)


def similarity_with_details(file1, file2, threshold=0.7):
    """Compare two uploaded documents and describe their similar chunks.

    Args:
        file1: First upload (path string or object with a ``.name`` path).
        file2: Second upload, same form as *file1*.
        threshold: Minimum similarity for a chunk pair to be listed.

    Returns:
        A ``(markdown_text, heatmap)`` tuple. ``heatmap`` is ``None`` when an
        upload is missing or when no chunk pair reaches the threshold.
    """
    if file1 is None or file2 is None:
        return "Please upload two documents to compare.", None

    text1 = _load_uploaded_text(file1)
    text2 = _load_uploaded_text(file2)

    similar_pairs, sim_matrix, chunks1, chunks2 = get_similar_chunks(
        text1, text2, threshold=threshold
    )
    if not similar_pairs:
        return "No significant similarity found.", None

    sections = [
        f"### Similar Chunk {i} (Score: {pair['similarity']})\n"
        f"**Doc1:** {pair['doc1_chunk']}\n"
        f"**Doc2:** {pair['doc2_chunk']}\n\n"
        for i, pair in enumerate(similar_pairs, 1)
    ]
    heatmap_img = create_heatmap(sim_matrix, chunks1, chunks2)
    return "".join(sections), heatmap_img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- # ----------------- Gradio Interface -----------------
96
# ----------------- Gradio Interface -----------------
# NOTE(review): the source indentation was lost; the widget nesting below
# (only the two file inputs inside the Row) is reconstructed — confirm.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Document Similarity Checker with Highlighted Chunks")
    with gr.Row():
        file1 = gr.File(label="Upload Document 1")
        file2 = gr.File(label="Upload Document 2")
    threshold = gr.Slider(0, 1, value=0.7, step=0.05, label="Similarity Threshold")
    output_text = gr.Markdown()
    output_heatmap = gr.Image()
    submit = gr.Button("Check Similarity")

    # The callback returns (markdown text, heatmap), matching the two outputs.
    submit.click(fn=similarity_with_details, inputs=[file1, file2, threshold], outputs=[output_text, output_heatmap])

# Run the Gradio app
# The port comes from GRADIO_SERVER_PORT, falling back to 7860 when unset.
port = int(os.getenv('GRADIO_SERVER_PORT', 7860))
demo.launch(server_port=port, server_name="0.0.0.0")
 
 
 
1
  import os
2
+ import fitz # PyMuPDF for PDF extraction
3
+ import docx # python-docx for DOCX extraction
4
  from sentence_transformers import SentenceTransformer, util
5
  import gradio as gr
6
+ import re
7
+ from typing import List, Tuple, Dict
8
  import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ from collections import defaultdict
 
11
 
12
# Initialize the SentenceTransformer model once at import time; the functions
# in this file reuse this single instance through ``model.encode``.
model = SentenceTransformer('all-MiniLM-L6-v2')
14
 
 
15
  def extract_text_from_pdf(pdf_path):
16
  try:
17
  doc = fitz.open(pdf_path)
 
20
  text += page.get_text()
21
  return text
22
  except Exception as e:
23
+ print(f"Error extracting text from PDF: {str(e)}")
24
+ return ""
25
 
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Any failure is reported with ``print`` and an empty string is returned
    instead of raising.
    """
    try:
        paragraph_texts = [
            paragraph.text for paragraph in docx.Document(docx_path).paragraphs
        ]
        return "\n".join(paragraph_texts)
    except Exception as error:
        print(f"Error extracting text from DOCX: {str(error)}")
    return ""
 
 
 
 
 
34
 
35
# Sentence boundary: one whitespace character that follows '.', '?' or '!',
# unless it comes right after a dotted abbreviation such as "e.g." or a
# capitalised two-letter token such as "Mr.".
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def preprocess_text(text: str, min_length: int = 10) -> List[str]:
    """Split *text* into cleaned sentences.

    Args:
        text: Raw document text. Empty or ``None`` input yields ``[]``.
        min_length: Sentences are kept only when their stripped length is
            strictly greater than this value. The default of 10 matches the
            cutoff that was previously hard-coded.

    Returns:
        The stripped sentences, in document order.
    """
    if not text:
        return []
    stripped = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [sentence for sentence in stripped if len(sentence) > min_length]
42
 
43
def calculate_cosine_similarity(doc1: str, doc2: str, threshold: float = 0.7) -> Tuple[float, List[Tuple[str, str, float]]]:
    """Calculate an overall similarity score and the similar sentence pairs.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        threshold: A sentence of *doc1* is reported together with its best
            match in *doc2* when their cosine similarity is strictly greater
            than this value. Defaults to the previously hard-coded 0.7.

    Returns:
        ``(overall_similarity, similar_pairs)``. ``overall_similarity`` is the
        average of two means: the best-match score of every *doc1* sentence
        and the best-match score of every *doc2* sentence. ``similar_pairs``
        holds ``(doc1_sentence, doc2_sentence, score)`` tuples. When either
        document yields no sentences the result is ``(0.0, [])``.
    """
    sentences1 = preprocess_text(doc1)
    sentences2 = preprocess_text(doc2)

    # Decide the empty case before touching the model: there is nothing to
    # encode or compare when one side has no sentences.
    if not sentences1 or not sentences2:
        return 0.0, []

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    # Rows correspond to doc1 sentences, columns to doc2 sentences.
    cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)

    # Best doc2 match per doc1 sentence, computed in one tensor reduction
    # instead of an element-by-element Python loop.
    best_scores, best_indices = cosine_similarities.max(dim=1)
    similar_pairs = [
        (sentence, sentences2[match_idx], score)
        for sentence, score, match_idx in zip(
            sentences1, best_scores.tolist(), best_indices.tolist()
        )
        if score > threshold
    ]

    # Symmetric overall score: average the per-sentence best matches in both
    # directions.
    reverse_best_scores = cosine_similarities.max(dim=0)[0]
    mean_similarity = (best_scores.mean() + reverse_best_scores.mean()) / 2.0
    return mean_similarity.item(), similar_pairs
83
 
84
def visualize_similarity(sentences1, sentences2, similarity_matrix, output_path='similarity_heatmap.png'):
    """Save a heatmap of the sentence similarity matrix and return its path.

    Args:
        sentences1: Sentences of document 1. Not used for drawing; kept so
            existing callers keep working.
        sentences2: Sentences of document 2. Not used for drawing; kept so
            existing callers keep working.
        similarity_matrix: 2-D array of similarity scores to draw.
        output_path: Where the PNG is written. Defaults to the previously
            hard-coded ``'similarity_heatmap.png'``.

    Returns:
        The path the image was written to.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        image = ax.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
        fig.colorbar(image, ax=ax, label='Similarity Score')
        ax.set_xlabel('Document 2 Sentences')
        ax.set_ylabel('Document 1 Sentences')
        ax.set_title('Sentence Similarity Heatmap')
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Close this specific figure even if drawing or saving fails, so
        # figures do not pile up across requests.
        plt.close(fig)
    return output_path
96
+
97
# Common statement-of-purpose concepts and the keyword stems that signal them.
# Order matters: the first concept with a matching keyword wins.
_CONCEPT_KEYWORDS = {
    'research': ('research', 'study', 'investigate', 'experiment', 'methodology'),
    'education': ('education', 'learn', 'course', 'degree', 'academic'),
    'experience': ('experience', 'work', 'job', 'intern', 'position'),
    'goals': ('goal', 'objective', 'aim', 'purpose', 'aspiration'),
    'skills': ('skill', 'ability', 'proficient', 'expertise', 'capability'),
}

# One case-insensitive pattern per concept. Keywords must start at a word
# boundary, so "skills" and "internship" still match their stems while
# "claim" no longer matches "aim" and "network" no longer matches "work".
_CONCEPT_PATTERNS = {
    concept: re.compile(
        r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + ')',
        re.IGNORECASE,
    )
    for concept, keywords in _CONCEPT_KEYWORDS.items()
}


def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
    """Group similar sentence pairs by concept using keyword matching.

    A pair belongs to the first concept for which either of its sentences
    contains a word starting with one of that concept's keywords; pairs that
    match no concept are grouped under ``'other'``.

    Args:
        similar_pairs: ``(doc1_sentence, doc2_sentence, score)`` tuples.

    Returns:
        A mapping from concept name to the pairs assigned to it, containing
        only concepts that received at least one pair.
    """
    concept_groups = defaultdict(list)
    for sent1, sent2, score in similar_pairs:
        matched_concept = next(
            (
                concept
                for concept, pattern in _CONCEPT_PATTERNS.items()
                if pattern.search(sent1) or pattern.search(sent2)
            ),
            'other',
        )
        concept_groups[matched_concept].append((sent1, sent2, score))
    return concept_groups
121
 
122
def _extract_uploaded_text(upload) -> str:
    """Return the text of one uploaded document, dispatching on its extension."""
    # Accept either a plain path string or an object exposing the path as
    # ``.name``.
    path = upload if isinstance(upload, str) else upload.name
    # Compare case-insensitively so that "REPORT.PDF" is still read as a PDF.
    if path.lower().endswith('.pdf'):
        return extract_text_from_pdf(path)
    return extract_text_from_docx(path)


def similarity(file1, file2):
    """Compare two uploaded documents and return an HTML report.

    Args:
        file1: First upload (path string or object with a ``.name`` path).
        file2: Second upload, same form as *file1*.

    Returns:
        An HTML string with the overall similarity score, the similar
        sentence pairs grouped by concept, and a heatmap image tag when both
        documents contain sentences. If either upload is missing, a short
        HTML prompt asking for two documents is returned instead.
    """
    import html  # local import: not among this file's module-level imports

    if file1 is None or file2 is None:
        return "<p>Please upload two documents to compare.</p>"

    # Extract text based on file type
    text1 = _extract_uploaded_text(file1)
    text2 = _extract_uploaded_text(file2)

    # Calculate similarity and group the similar pairs by concept
    overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)
    concept_groups = group_similar_concepts(similar_pairs)

    # Collect the report fragments and join once at the end.
    parts = [f"<h3>Overall Similarity Score: {overall_similarity:.2%}</h3>"]

    if similar_pairs:
        parts.append("<h4>Similar Content Found:</h4>")
        for concept, pairs in concept_groups.items():
            if not pairs:  # Only show concepts with matches
                continue
            parts.append(f"<h5>{concept.capitalize()}:</h5>")
            for sent1, sent2, score in pairs:
                # Sentences come from user-supplied documents, so escape them
                # before embedding them in HTML.
                parts.append(
                    '<div style="background-color: #f0f0f0; padding: 10px; margin: 5px; border-radius: 5px;">'
                    f"<p><b>Document 1:</b> {html.escape(sent1)}</p>"
                    f"<p><b>Document 2:</b> {html.escape(sent2)}</p>"
                    f"<p><b>Similarity:</b> {score:.2%}</p>"
                    "</div>"
                )
    else:
        parts.append("<p>No significant similarities found above the threshold.</p>")

    # Generate similarity heatmap if there are sentences
    sentences1 = preprocess_text(text1)
    sentences2 = preprocess_text(text2)

    if sentences1 and sentences2:
        # NOTE(review): these embeddings repeat work already done inside
        # calculate_cosine_similarity, which does not return its matrix.
        embeddings1 = model.encode(sentences1, convert_to_tensor=True)
        embeddings2 = model.encode(sentences2, convert_to_tensor=True)
        similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()

        heatmap_path = visualize_similarity(sentences1, sentences2, similarity_matrix)
        parts.append(
            f'<h4>Similarity Heatmap:</h4><img src="/file={heatmap_path}" alt="Similarity Heatmap" style="max-width: 100%;">'
        )

    return "".join(parts)
168
 
169
+ # Create a Gradio interface with enhanced features
170
# Build the Gradio interface: two file inputs and a button on the left, the
# HTML report on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Document Similarity Checker with Detailed Analysis
    Upload two documents to compare their content and identify specific similarities.
    """)

    with gr.Row():
        with gr.Column():
            file1 = gr.File(label="Upload Document 1", file_types=[".pdf", ".docx"])
            file2 = gr.File(label="Upload Document 2", file_types=[".pdf", ".docx"])
            submit = gr.Button("Compare Documents", variant="primary")

        with gr.Column():
            output = gr.HTML(label="Similarity Analysis Results")

    # Add examples for users to try. Only pairs whose files are actually
    # present next to this script are offered, so a missing sample file does
    # not produce a broken example row.
    _example_dir = os.path.dirname(__file__)
    _candidate_examples = [
        [os.path.join(_example_dir, "sample1.pdf"), os.path.join(_example_dir, "sample2.pdf")],
        [os.path.join(_example_dir, "sample1.docx"), os.path.join(_example_dir, "sample2.docx")],
    ]
    _available_examples = [
        pair for pair in _candidate_examples
        if all(os.path.isfile(path) for path in pair)
    ]
    if _available_examples:
        gr.Examples(
            examples=_available_examples,
            inputs=[file1, file2],
            outputs=output,
            fn=similarity,
            cache_examples=False,
        )

    submit.click(fn=similarity, inputs=[file1, file2], outputs=output)

# Use the GRADIO_SERVER_PORT environment variable, default to 7860 if not set
port = int(os.getenv('GRADIO_SERVER_PORT', 7860))

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=port)