Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -61,50 +61,40 @@ def calculate_similarity(word, other_words, model, threshold=0.5):
|
|
61 |
|
62 |
|
63 |
def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
|
64 |
-
# Load a pre-trained sentence-transformer model
|
65 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
66 |
|
67 |
# Split each paragraph into sentences
|
68 |
all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
# Encode all sentences into vectors
|
|
|
72 |
sentence_embeddings = model.encode(flattened_sentences)
|
73 |
|
74 |
-
# Calculate cosine similarities between
|
75 |
cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)
|
76 |
|
77 |
-
#
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
# Iterate over the matrix to find sentences with high cosine similarity
|
84 |
-
color_index = 0 # Initialize color index
|
85 |
-
for i in range(len(cosine_similarities)):
|
86 |
-
for j in range(i + 1, len(cosine_similarities)):
|
87 |
-
if cosine_similarities[i, j] > similarity_threshold and not highlighted_sentences[i]:
|
88 |
-
# Select color for highlighting
|
89 |
color = colors[color_index % len(colors)]
|
|
|
|
|
|
|
|
|
90 |
color_index += 1 # Move to the next color
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
# Reconstruct the paragraphs with highlighted sentences
|
97 |
-
highlighted_paragraphs = []
|
98 |
-
sentence_index = 0
|
99 |
-
for paragraph_sentences in all_sentences:
|
100 |
-
highlighted_paragraph = ''
|
101 |
-
for _ in paragraph_sentences:
|
102 |
-
# Use the original sentence if it wasn't highlighted; otherwise, use the highlighted version.
|
103 |
-
highlighted_sentence = highlighted_sentences[sentence_index] if highlighted_sentences[sentence_index] else flattened_sentences[sentence_index]
|
104 |
-
highlighted_paragraph += highlighted_sentence + ' '
|
105 |
-
sentence_index += 1
|
106 |
-
highlighted_paragraphs.append(highlighted_paragraph)
|
107 |
-
|
108 |
# Combine all paragraphs into one HTML string
|
109 |
html_output = '<div>' + '<br/><br/>'.join(highlighted_paragraphs) + '</div>'
|
110 |
return highlighted_paragraphs
|
|
|
61 |
|
62 |
|
63 |
def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
    """Colour sentences that closely match a sentence in another paragraph.

    Each paragraph is split into sentences, all sentences are embedded with
    the 'all-MiniLM-L6-v2' sentence-transformer model, and every pair of
    sentences taken from *different* paragraphs whose cosine similarity is
    above ``similarity_threshold`` is wrapped in a coloured ``<span>``.
    Sentences without a match are kept unchanged.

    Args:
        paragraphs: Iterable of paragraph strings.
        similarity_threshold: Cosine similarity above which two sentences
            are treated as similar.

    Returns:
        A list with one string per input paragraph: the paragraph's
        sentences joined by single spaces, with matched sentences wrapped
        in ``<span style='color: ...'>`` markup.
    """
    # Split each paragraph into sentences
    all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]

    # Start from the original text so sentences that never match are kept
    # in the output instead of being replaced by empty strings.
    highlighted_sentences = [list(para) for para in all_sentences]
    colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']

    # For every flattened sentence remember (paragraph index, position inside
    # that paragraph). The flat index modulo the paragraph length is NOT the
    # position once earlier paragraphs have a different number of sentences.
    sentence_locations = [
        (para_idx, sent_idx)
        for para_idx, para in enumerate(all_sentences)
        for sent_idx in range(len(para))
    ]
    flattened_sentences = [sentence for para in all_sentences for sentence in para]

    # Nothing to compare: skip loading the model and encoding an empty batch.
    if not flattened_sentences:
        return [' '.join(para) for para in highlighted_sentences]

    # Encode all sentences into vectors
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentence_embeddings = model.encode(flattened_sentences)

    # Calculate cosine similarities between all pairs of sentences
    cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)

    # NOTE(review): sentence text is inserted into the markup unescaped; if
    # the paragraphs come from untrusted input, confirm the caller escapes it.
    sentence_count = len(flattened_sentences)
    already_highlighted = [False] * sentence_count
    color_index = 0
    for i in range(sentence_count):
        para_i = sentence_locations[i][0]
        for j in range(sentence_count):
            # Cheap structural checks first; only cross-paragraph pairs count.
            if i == j or para_i == sentence_locations[j][0]:
                continue
            if not cosine_similarities[i, j] > similarity_threshold:
                continue

            color = colors[color_index % len(colors)]
            for flat_idx in (i, j):
                # The first colour assigned to a sentence wins.
                if already_highlighted[flat_idx]:
                    continue
                para_idx, sent_idx = sentence_locations[flat_idx]
                highlighted_sentences[para_idx][sent_idx] = (
                    f"<span style='color: {color}'>{flattened_sentences[flat_idx]}</span>"
                )
                already_highlighted[flat_idx] = True
            color_index += 1  # Move to the next color

    # Combine sentences back into paragraphs
    return [' '.join(para) for para in highlighted_sentences]
|