Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -59,90 +59,61 @@ def calculate_similarity(word, other_words, model, threshold=0.5):
|
|
59 |
return None, None
|
60 |
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
"\033[41m", # Background Red
|
67 |
-
"\033[42m", # Background Green
|
68 |
-
"\033[43m", # Background Yellow
|
69 |
-
"\033[44m", # Background Blue
|
70 |
-
"\033[45m", # Background Purple
|
71 |
-
"\033[46m", # Background Cyan
|
72 |
-
"\033[100m", # Background Dark Gray
|
73 |
-
"\033[101m", # Background Light Red
|
74 |
-
"\033[102m", # Background Light Green
|
75 |
-
"\033[103m", # Background Light Yellow
|
76 |
-
"\033[104m", # Background Light Blue
|
77 |
-
"\033[105m", # Background Light Purple
|
78 |
-
"\033[106m", # Background Light Cyan
|
79 |
-
"\033[47m" # Background Gray
|
80 |
-
]
|
81 |
-
html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray"]
|
82 |
-
color_index = 0
|
83 |
-
|
84 |
-
highlighted_sentences = []
|
85 |
-
for sentence in sentences:
|
86 |
-
words = word_tokenize(sentence)
|
87 |
-
other_sentences = [s for s in sentences if s != sentence]
|
88 |
-
all_other_words = [word for s in other_sentences for word in word_tokenize(s) if word.lower() not in exclude_words and word.isalnum()]
|
89 |
-
|
90 |
-
highlighted_words = []
|
91 |
-
for word in words:
|
92 |
-
if word.lower() not in exclude_words and word.isalnum():
|
93 |
-
match_index, similarity = calculate_similarity(word, all_other_words, model)
|
94 |
-
if match_index is not None:
|
95 |
-
# Assign color to the word if not already assigned
|
96 |
-
if word not in word_to_color:
|
97 |
-
word_to_color[word] = html_color_codes[color_index % len(html_color_codes)]
|
98 |
-
color_index += 1
|
99 |
-
# Highlight the word
|
100 |
-
#highlighted_word = f"{word_to_color[word]}{word}\033[0m"
|
101 |
-
highlighted_word = "<span style='color: "+ word_to_color[word] +"'>"+ word +"</span>"
|
102 |
-
else:
|
103 |
-
highlighted_word = word
|
104 |
-
highlighted_words.append(highlighted_word)
|
105 |
-
else:
|
106 |
-
highlighted_words.append(word)
|
107 |
-
|
108 |
-
highlighted_sentences.append(' '.join(highlighted_words))
|
109 |
-
return highlighted_sentences
|
110 |
-
|
111 |
-
# Rest of the code, including the cluster_sentences function, remains the same
|
112 |
-
|
113 |
-
exclude_words = {"a", "the", "for", "from", "of", "in","over", "as", "on", "is", "am", "have", "an","has", "had", "and", "by", "it", "its", "those", "these", "was", "were", "their", "them", "I", "you", "also", "your", "me", "after"}
|
114 |
-
|
115 |
-
def cluster_sentences(sentences, model, num_clusters=3):
    """Assign each sentence a k-means cluster label based on its embedding.

    Args:
        sentences: Texts handed to ``model.encode``.
        model: Object whose ``encode`` method turns the sentences into
            embeddings (a ``SentenceTransformer`` at this file's call sites).
        num_clusters: Passed to ``KMeans`` as ``n_clusters``.

    Returns:
        The ``labels_`` attribute of the fitted ``KMeans`` estimator.
    """
    sentence_vectors = model.encode(sentences)
    # ``fit`` returns the fitted estimator, so the labels can be read off it.
    fitted = KMeans(n_clusters=num_clusters).fit(sentence_vectors)
    return fitted.labels_
|
120 |
-
|
121 |
-
model = SentenceTransformer('all-mpnet-base-v2')
|
122 |
-
exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
|
123 |
-
|
124 |
-
sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
|
125 |
-
|
126 |
-
"Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
|
127 |
-
|
128 |
-
"A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
|
129 |
-
|
130 |
-
"In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
|
131 |
-
|
132 |
-
# Step 1: Cluster the sentences
|
133 |
-
num_clusters = 1
|
134 |
-
sentence_clusters = cluster_sentences(sentences, model, num_clusters)
|
135 |
-
|
136 |
-
# Step 2: Highlight similar words within each cluster
|
137 |
-
clustered_sentences = [[] for _ in range(num_clusters)]
|
138 |
-
for sentence, cluster_id in zip(sentences, sentence_clusters):
|
139 |
-
clustered_sentences[cluster_id].append(sentence)
|
140 |
|
141 |
-
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
|
|
146 |
def calculate_similarity_score(sentences):
|
147 |
# Encode all sentences to get their embeddings
|
148 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
@@ -233,36 +204,20 @@ def updateChoices(prompt):
|
|
233 |
return gr.CheckboxGroup(choices=newChoices)
|
234 |
|
235 |
def setTextVisibility(cbg, model_name_input):
|
236 |
-
sentences = []
|
237 |
-
result = []
|
238 |
-
model = SentenceTransformer('all-mpnet-base-v2')
|
239 |
-
exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
|
240 |
-
sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
|
241 |
-
"Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
|
242 |
-
"A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
|
243 |
-
"In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
|
244 |
-
for text in cbg:
|
245 |
-
sentences.append(answer_question(text, model_name_input))
|
246 |
-
|
247 |
-
# Step 1: Cluster the sentences
|
248 |
-
num_clusters = 1
|
249 |
-
sentence_clusters = cluster_sentences(sentences, model, num_clusters)
|
250 |
-
|
251 |
-
# Step 2: Highlight similar words within each cluster
|
252 |
-
clustered_sentences = [[] for _ in range(num_clusters)]
|
253 |
|
254 |
-
|
255 |
-
|
|
|
|
|
|
|
256 |
|
257 |
-
highlighted_clustered_sentences = []
|
258 |
-
|
259 |
-
for cluster in clustered_sentences:
|
260 |
-
highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
|
261 |
|
262 |
-
|
|
|
|
|
263 |
result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
|
264 |
|
265 |
-
score = round(calculate_similarity_score(
|
266 |
|
267 |
final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""
|
268 |
|
|
|
59 |
return None, None
|
60 |
|
61 |
|
62 |
+
from sentence_transformers import SentenceTransformer, util
|
63 |
+
import nltk
|
64 |
+
nltk.download('punkt') # Ensure you have the punkt tokenizer models
|
65 |
+
from nltk import tokenize
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
+
# Name of the sentence-transformer checkpoint used for highlighting.
_HIGHLIGHT_MODEL_NAME = 'all-MiniLM-L6-v2'

# Models already constructed in this process, keyed by checkpoint name, so the
# weights are loaded once instead of on every call.
_highlight_model_cache = {}


def _get_highlight_model(model_name=_HIGHLIGHT_MODEL_NAME):
    """Return the SentenceTransformer for *model_name*, building it at most once."""
    if model_name not in _highlight_model_cache:
        _highlight_model_cache[model_name] = SentenceTransformer(model_name)
    return _highlight_model_cache[model_name]


def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
    """Colour pairs of semantically similar sentences across paragraphs.

    Every paragraph is split into sentences, all sentences are embedded
    together, and their pairwise cosine similarities are computed. Sentences
    are then visited in order: a sentence that has no colour yet is paired
    with the first later sentence whose similarity exceeds the threshold, and
    both are wrapped in a ``<span>`` carrying the next colour of the palette.

    Args:
        paragraphs: Iterable of paragraph strings.
        similarity_threshold: A pair is coloured only when its cosine
            similarity is strictly greater than this value.

    Returns:
        A list with one HTML string per input paragraph, in input order.
        Each sentence of a paragraph is followed by a single space, so a
        non-empty paragraph ends with a trailing space.
    """
    # Split each paragraph into sentences, then flatten for batch encoding.
    all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
    flattened_sentences = [
        sentence for sublist in all_sentences for sentence in sublist
    ]

    # Nothing to compare: skip model loading and encoding altogether.
    if not flattened_sentences:
        return ['' for _ in all_sentences]

    model = _get_highlight_model()
    sentence_embeddings = model.encode(flattened_sentences)
    cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)

    # Palette for highlighting; it is cycled when there are more pairs than colours.
    colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender',
              'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']

    sentence_count = len(flattened_sentences)
    # '' means "not highlighted"; otherwise the slot holds the wrapped sentence.
    highlighted_sentences = [''] * sentence_count

    color_index = 0
    for i in range(sentence_count):
        # A sentence already coloured as the partner of an earlier one never
        # starts a pair of its own.
        if highlighted_sentences[i]:
            continue
        for j in range(i + 1, sentence_count):
            if cosine_similarities[i, j] > similarity_threshold:
                color = colors[color_index % len(colors)]
                color_index += 1
                # NOTE(review): sentence text is inserted into the markup
                # unescaped - confirm callers only pass trusted text.
                highlighted_sentences[i] = (
                    f"<span style='color: {color}'>{flattened_sentences[i]}</span>"
                )
                # This overwrites any colour sentence j received earlier.
                highlighted_sentences[j] = (
                    f"<span style='color: {color}'>{flattened_sentences[j]}</span>"
                )
                # Sentence i is coloured now, so no later j can pair with it.
                break

    # Use the highlighted version where one exists, the original text otherwise.
    rendered_sentences = [
        marked or plain
        for marked, plain in zip(highlighted_sentences, flattened_sentences)
    ]

    # Rebuild the paragraphs by slicing the flat list back into groups.
    highlighted_paragraphs = []
    start = 0
    for paragraph_sentences in all_sentences:
        end = start + len(paragraph_sentences)
        highlighted_paragraphs.append(
            ''.join(f'{sentence} ' for sentence in rendered_sentences[start:end])
        )
        start = end

    return highlighted_paragraphs
|
115 |
|
116 |
+
|
117 |
def calculate_similarity_score(sentences):
|
118 |
# Encode all sentences to get their embeddings
|
119 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
204 |
return gr.CheckboxGroup(choices=newChoices)
|
205 |
|
206 |
def setTextVisibility(cbg, model_name_input):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
+
sentences = [answer_question(text, model_name_input) for text in cbg]
|
209 |
+
|
210 |
+
# Apply highlighting to all processed answers; the result is a list with one highlighted HTML string per answer.
|
211 |
+
highlighted_html = []
|
212 |
+
highlighted_html = highlight_similar_paragraphs_with_colors(sentences, similarity_threshold=0.05)
|
213 |
|
|
|
|
|
|
|
|
|
214 |
|
215 |
+
result = []
|
216 |
+
# Pair each original 'cbg' entry with the highlighted paragraph at the same index.
|
217 |
+
for idx, sentence in enumerate(highlighted_html):
|
218 |
result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
|
219 |
|
220 |
+
score = round(calculate_similarity_score(highlighted_html))
|
221 |
|
222 |
final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""
|
223 |
|