Update app.py
Browse files
app.py
CHANGED
@@ -1,18 +1,17 @@
|
|
1 |
import os
|
2 |
-
import fitz # PyMuPDF
|
3 |
-
import docx
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
import gradio as gr
|
6 |
-
import
|
|
|
7 |
import matplotlib.pyplot as plt
|
8 |
-
import
|
9 |
-
from
|
10 |
-
import base64
|
11 |
|
12 |
-
#
|
13 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
14 |
|
15 |
-
# ----------------- Text Extraction -----------------
|
16 |
def extract_text_from_pdf(pdf_path):
|
17 |
try:
|
18 |
doc = fitz.open(pdf_path)
|
@@ -21,90 +20,184 @@ def extract_text_from_pdf(pdf_path):
|
|
21 |
text += page.get_text()
|
22 |
return text
|
23 |
except Exception as e:
|
24 |
-
|
|
|
25 |
|
26 |
def extract_text_from_docx(docx_path):
|
27 |
try:
|
28 |
doc = docx.Document(docx_path)
|
29 |
-
|
|
|
30 |
except Exception as e:
|
31 |
-
|
32 |
-
|
33 |
-
# ----------------- Chunk Similarity -----------------
|
34 |
-
def chunk_text(text, chunk_size=5):
|
35 |
-
sentences = [s.strip() for s in text.split('.') if s.strip()]
|
36 |
-
chunks = ['. '.join(sentences[i:i+chunk_size]) for i in range(0, len(sentences), chunk_size)]
|
37 |
-
return chunks
|
38 |
|
39 |
-
def
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
|
47 |
-
|
|
|
48 |
similar_pairs = []
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
plt.figure(figsize=(10,8))
|
62 |
-
|
63 |
-
|
64 |
-
plt.xlabel(
|
65 |
-
plt.ylabel(
|
66 |
-
plt.title(
|
67 |
-
|
68 |
-
|
69 |
-
plt.savefig(buf, format="png")
|
70 |
-
buf.seek(0)
|
71 |
-
encoded = base64.b64encode(buf.getvalue()).decode()
|
72 |
-
buf.close()
|
73 |
plt.close()
|
74 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
|
77 |
-
|
78 |
text1 = extract_text_from_pdf(file1.name) if file1.name.endswith('.pdf') else extract_text_from_docx(file1.name)
|
79 |
text2 = extract_text_from_pdf(file2.name) if file2.name.endswith('.pdf') else extract_text_from_docx(file2.name)
|
80 |
-
|
81 |
-
similar_pairs, sim_matrix, chunks1, chunks2 = get_similar_chunks(text1, text2, threshold=threshold)
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
#
|
96 |
-
with gr.Blocks() as demo:
|
97 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
98 |
with gr.Row():
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
-
#
|
109 |
port = int(os.getenv('GRADIO_SERVER_PORT', 7860))
|
110 |
-
|
|
|
|
|
|
1 |
import os
|
2 |
+
import fitz # PyMuPDF for PDF extraction
|
3 |
+
import docx # python-docx for DOCX extraction
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
import gradio as gr
|
6 |
+
import re
|
7 |
+
from typing import List, Tuple, Dict
|
8 |
import matplotlib.pyplot as plt
|
9 |
+
import numpy as np
|
10 |
+
from collections import defaultdict
|
|
|
11 |
|
12 |
+
# Initialize the SentenceTransformer model
|
13 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
14 |
|
|
|
15 |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages joined in page order, or an empty string
        if the file cannot be opened or parsed (the error is printed).
    """
    try:
        # The context manager closes the document handle even when a page
        # fails to parse; previously the handle was never closed.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort extraction: callers treat "" as "no usable text".
        print(f"Error extracting text from PDF: {str(e)}")
        return ""
|
25 |
|
26 |
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Any failure while opening or reading the document is printed and an
    empty string is returned instead of raising.
    """
    try:
        document = docx.Document(docx_path)
        paragraph_texts = []
        for paragraph in document.paragraphs:
            paragraph_texts.append(paragraph.text)
        return "\n".join(paragraph_texts)
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        return ""
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
def preprocess_text(text: str) -> List[str]:
    """Break text into trimmed sentences, discarding very short fragments.

    The text is split on whitespace that follows '.', '?' or '!', except
    after abbreviation-like patterns such as "e.g." or "Dr.". Fragments
    whose trimmed length is 10 characters or fewer are dropped.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'

    kept: List[str] = []
    for fragment in re.split(boundary, text):
        trimmed = fragment.strip()
        if len(trimmed) > 10:
            kept.append(trimmed)
    return kept
|
42 |
|
43 |
+
def calculate_cosine_similarity(
    doc1: str, doc2: str, threshold: float = 0.7
) -> Tuple[float, List[Tuple[str, str, float]]]:
    """Score two documents sentence-by-sentence and list the close matches.

    Args:
        doc1: Full text of the first document.
        doc2: Full text of the second document.
        threshold: A sentence of ``doc1`` is reported only when its best
            match in ``doc2`` scores strictly above this value. Defaults
            to 0.7, the value that was previously hard-coded.

    Returns:
        A tuple ``(overall_similarity, similar_pairs)``.
        ``overall_similarity`` is the average of two means: the best-match
        score of every ``doc1`` sentence and the best-match score of every
        ``doc2`` sentence. ``similar_pairs`` holds one
        ``(doc1_sentence, best_doc2_sentence, score)`` tuple per ``doc1``
        sentence that clears the threshold. ``(0.0, [])`` is returned when
        either document yields no sentences.
    """
    sentences1 = preprocess_text(doc1)
    sentences2 = preprocess_text(doc2)

    # Bail out before encoding: there is nothing to compare, and encoding
    # or comparing empty batches was previously attempted ahead of the
    # emptiness check.
    if not sentences1 or not sentences2:
        return 0.0, []

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    # Rows index sentences1, columns index sentences2.
    cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)

    # Row-wise max gives each doc1 sentence's best match and its column;
    # this replaces an element-by-element Python double loop.
    best_scores1, best_indices1 = cosine_similarities.max(dim=1)
    best_scores2 = cosine_similarities.max(dim=0)[0]

    similar_pairs = [
        (sentences1[row], sentences2[col], score)
        for row, (score, col) in enumerate(
            zip(best_scores1.tolist(), best_indices1.tolist())
        )
        if score > threshold
    ]

    overall_similarity = ((best_scores1.mean() + best_scores2.mean()) / 2.0).item()
    return overall_similarity, similar_pairs
|
83 |
|
84 |
+
def visualize_similarity(sentences1, sentences2, similarity_matrix):
    """Render the sentence-similarity matrix as a heatmap PNG.

    Args:
        sentences1: Sentences of document 1 (not used by the plot itself;
            kept for interface compatibility).
        sentences2: Sentences of document 2 (likewise unused).
        similarity_matrix: 2-D array-like of similarity scores passed
            straight to ``imshow``. Labelled on the plot as document 1
            sentences on the y axis and document 2 sentences on the x axis.

    Returns:
        The path of the written image, ``'similarity_heatmap.png'``
        (relative to the current working directory).
    """
    heatmap_path = 'similarity_heatmap.png'

    # Work on an explicit figure rather than pyplot's implicit "current
    # figure", and always close it so a failure while drawing or saving
    # does not leak the figure.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        image = ax.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
        fig.colorbar(image, ax=ax, label='Similarity Score')
        ax.set_xlabel('Document 2 Sentences')
        ax.set_ylabel('Document 1 Sentences')
        ax.set_title('Sentence Similarity Heatmap')
        fig.tight_layout()
        fig.savefig(heatmap_path)
    finally:
        plt.close(fig)

    return heatmap_path
|
96 |
+
|
97 |
+
def group_similar_concepts(similar_pairs: List[Tuple[str, str, float]]) -> Dict[str, List[Tuple[str, str, float]]]:
    """Bucket matched sentence pairs under the first concept they mention.

    A pair belongs to a concept when any of that concept's keywords occurs
    as a substring (case-insensitively) in either sentence. Concepts are
    tried in declaration order and the first hit wins; pairs that mention
    no keyword are filed under 'other'.
    """
    # Keyword lists for concepts commonly found in SOPs.
    concepts = {
        'research': ['research', 'study', 'investigate', 'experiment', 'methodology'],
        'education': ['education', 'learn', 'course', 'degree', 'academic'],
        'experience': ['experience', 'work', 'job', 'intern', 'position'],
        'goals': ['goal', 'objective', 'aim', 'purpose', 'aspiration'],
        'skills': ['skill', 'ability', 'proficient', 'expertise', 'capability']
    }

    concept_groups = defaultdict(list)

    for sent1, sent2, score in similar_pairs:
        lowered = (sent1.lower(), sent2.lower())
        matched_concept = next(
            (
                name
                for name, keywords in concepts.items()
                if any(word in candidate for candidate in lowered for word in keywords)
            ),
            'other',
        )
        concept_groups[matched_concept].append((sent1, sent2, score))

    return concept_groups
|
121 |
|
122 |
+
def _upload_path(upload):
    """Return the filesystem path of an upload given as an object with ``.name`` or as a plain path."""
    # NOTE(review): depending on the Gradio version, a File input presumably
    # arrives either as a tempfile-like object or as a path string — confirm.
    return getattr(upload, "name", upload)


def _extract_document_text(path):
    """Extract text with the PDF reader for ``.pdf`` paths (any letter case), otherwise the DOCX reader."""
    if str(path).lower().endswith('.pdf'):
        return extract_text_from_pdf(path)
    return extract_text_from_docx(path)


def similarity(file1, file2):
    """Compare two uploaded documents and return an HTML report.

    Args:
        file1: First uploaded document (PDF or DOCX).
        file2: Second uploaded document (PDF or DOCX).

    Returns:
        An HTML string containing the overall similarity score, the
        matching sentence pairs grouped by concept, and — when both
        documents yield sentences — an ``<img>`` tag pointing at the saved
        heatmap. If either upload is missing, a short prompt is returned
        instead.
    """
    # Local import keeps this block self-contained.
    from html import escape

    if file1 is None or file2 is None:
        return "<p>Please upload two documents to compare.</p>"

    text1 = _extract_document_text(_upload_path(file1))
    text2 = _extract_document_text(_upload_path(file2))

    # Calculate similarity and get similar pairs
    overall_similarity, similar_pairs = calculate_cosine_similarity(text1, text2)

    # Group similar concepts
    concept_groups = group_similar_concepts(similar_pairs)

    # Collect fragments and join once instead of growing a string with +=.
    parts = [f"<h3>Overall Similarity Score: {overall_similarity:.2%}</h3>"]

    if similar_pairs:
        parts.append("<h4>Similar Content Found:</h4>")

        for concept, pairs in concept_groups.items():
            if not pairs:  # Only show concepts with matches
                continue
            parts.append(f"<h5>{concept.capitalize()}:</h5>")
            for sent1, sent2, score in pairs:
                # Sentences come from user-supplied files: escape them so
                # document content cannot inject markup into the report.
                parts.append(f"""
                <div style="background-color: #f0f0f0; padding: 10px; margin: 5px; border-radius: 5px;">
                    <p><b>Document 1:</b> {escape(sent1)}</p>
                    <p><b>Document 2:</b> {escape(sent2)}</p>
                    <p><b>Similarity:</b> {score:.2%}</p>
                </div>
                """)
    else:
        parts.append("<p>No significant similarities found above the threshold.</p>")

    # Generate similarity heatmap if there are sentences
    sentences1 = preprocess_text(text1)
    sentences2 = preprocess_text(text2)

    if sentences1 and sentences2:
        # Get embeddings for visualization
        embeddings1 = model.encode(sentences1, convert_to_tensor=True)
        embeddings2 = model.encode(sentences2, convert_to_tensor=True)
        similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2).cpu().numpy()

        # Generate and save heatmap
        heatmap_path = visualize_similarity(sentences1, sentences2, similarity_matrix)
        parts.append(
            f'<h4>Similarity Heatmap:</h4><img src="/file={heatmap_path}" alt="Similarity Heatmap" style="max-width: 100%;">'
        )

    return "".join(parts)
|
168 |
|
169 |
+
# Create a Gradio interface with enhanced features
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Document Similarity Checker with Detailed Analysis
    Upload two documents to compare their content and identify specific similarities.
    """)

    with gr.Row():
        # Left column: the two uploads and the trigger button.
        with gr.Column():
            file1 = gr.File(label="Upload Document 1", file_types=[".pdf", ".docx"])
            file2 = gr.File(label="Upload Document 2", file_types=[".pdf", ".docx"])
            submit = gr.Button("Compare Documents", variant="primary")

        # Right column: the rendered HTML report.
        with gr.Column():
            output = gr.HTML(label="Similarity Analysis Results")

    # Add examples for users to try. Sample files are looked up next to
    # this script, and only pairs whose files both exist are offered.
    # NOTE(review): a missing sample file presumably makes gr.Examples fail
    # while the UI is being built — confirm against the deployed Gradio version.
    _sample_dir = os.path.dirname(__file__)
    _example_pairs = [
        [os.path.join(_sample_dir, "sample1.pdf"), os.path.join(_sample_dir, "sample2.pdf")],
        [os.path.join(_sample_dir, "sample1.docx"), os.path.join(_sample_dir, "sample2.docx")],
    ]
    _available_examples = [
        pair for pair in _example_pairs if all(os.path.isfile(path) for path in pair)
    ]
    if _available_examples:
        gr.Examples(
            examples=_available_examples,
            inputs=[file1, file2],
            outputs=output,
            fn=similarity,
            cache_examples=False
        )

    submit.click(fn=similarity, inputs=[file1, file2], outputs=output)

# Use the GRADIO_SERVER_PORT environment variable, default to 7860 if not set
port = int(os.getenv('GRADIO_SERVER_PORT', 7860))

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the host/container.
    demo.launch(server_name="0.0.0.0", server_port=port)
|