Update app.py
app.py CHANGED
@@ -17,48 +17,10 @@ nlp.add_pipe('sentencizer')
 #model = BertModel.from_pretrained(bert_model_name)
 #model.eval()
 
-import torch
-import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
-
-def get_bert_embeddings(texts):
-    """Obtain BERT embeddings for a list of texts."""
-    embeddings = []
-    with torch.no_grad():
-        for text in texts:
-            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
-            outputs = model(**inputs)
-            # Take the mean of token embeddings as the sentence embedding
-            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-            embeddings.append(embedding)
-    return np.array(embeddings)
-
-def compute_similarity(embedding1, embeddings2):
-    """Compute cosine similarity between a single embedding and a set of embeddings."""
-    return cosine_similarity([embedding1], embeddings2)[0]
-
-def compare_paragraph_to_list(paragraph, paragraph_list, top_n=3):
-    """Compare a single paragraph to a list of paragraphs and return the top N most similar ones."""
-    # Get embedding for the target paragraph
-    target_embedding = get_bert_embeddings([paragraph])[0]  # Only one paragraph
-
-    # Get embeddings for the list of paragraphs
-    list_embeddings = get_bert_embeddings(paragraph_list)
-
-    # Compute similarity between the target and each paragraph in the list
-    similarity_scores = compute_similarity(target_embedding, list_embeddings)
-
-    # Combine paragraphs with their similarity scores
-    results = [
-        {'compared_paragraph': paragraph_list[i], 'similarity_score': similarity_scores[i]}
-        for i in range(len(paragraph_list))
-    ]
-
-    # Sort the results by similarity score in descending order and take the top N
-    sorted_results = sorted(results, key=lambda x: x['similarity_score'], reverse=True)[:top_n]
-
-    # Return only the top N most similar paragraphs
-    return sorted_results
+#import torch
+#import numpy as np
+#from sklearn.metrics.pairwise import cosine_similarity
+
 
 
 
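For context, this hunk deletes the BERT-based paragraph-similarity helpers (get_bert_embeddings, compute_similarity, compare_paragraph_to_list) and leaves only their imports, now commented out. The following is a minimal sketch of how the removed compare_paragraph_to_list helper could have been called before this change. It assumes the old app.py loaded a BERT tokenizer and model into the module-level names tokenizer and model (the commented-out BertModel.from_pretrained(bert_model_name) line in the unchanged context suggests this); bert_model_name, the example texts, and the BertTokenizer choice are assumptions for illustration.

# Hypothetical usage of the removed helpers; assumes the removed functions
# above are still defined, and that `tokenizer` and `model` were created
# roughly like this elsewhere in the old app.py.
from transformers import BertTokenizer, BertModel

bert_model_name = 'bert-base-uncased'  # assumption: any BERT checkpoint name
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name)
model.eval()

target = "Transformers encode sentences into dense vectors."
candidates = [
    "BERT produces contextual embeddings for text.",
    "The weather was pleasant all weekend.",
    "Cosine similarity compares two embedding vectors.",
]

# Returned the top-N candidates as dicts with 'compared_paragraph'
# and 'similarity_score', sorted by descending similarity.
top_matches = compare_paragraph_to_list(target, candidates, top_n=2)
for match in top_matches:
    print(f"{match['similarity_score']:.3f}  {match['compared_paragraph']}")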