Cachoups committed on
Commit
985c608
·
verified ·
1 Parent(s): 5951118

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -42
app.py CHANGED
@@ -17,48 +17,10 @@ nlp.add_pipe('sentencizer')
17
  #model = BertModel.from_pretrained(bert_model_name)
18
  #model.eval()
19
 
20
- import torch
21
- import numpy as np
22
- from sklearn.metrics.pairwise import cosine_similarity
23
-
24
- def get_bert_embeddings(texts):
25
- """Obtain BERT embeddings for a list of texts."""
26
- embeddings = []
27
- with torch.no_grad():
28
- for text in texts:
29
- inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
30
- outputs = model(**inputs)
31
- # Take the mean of token embeddings as the sentence embedding
32
- embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
33
- embeddings.append(embedding)
34
- return np.array(embeddings)
35
-
36
- def compute_similarity(embedding1, embeddings2):
37
- """Compute cosine similarity between a single embedding and a set of embeddings."""
38
- return cosine_similarity([embedding1], embeddings2)[0]
39
-
40
- def compare_paragraph_to_list(paragraph, paragraph_list, top_n=3):
41
- """Compare a single paragraph to a list of paragraphs and return the top N most similar ones."""
42
- # Get embedding for the target paragraph
43
- target_embedding = get_bert_embeddings([paragraph])[0] # Only one paragraph
44
-
45
- # Get embeddings for the list of paragraphs
46
- list_embeddings = get_bert_embeddings(paragraph_list)
47
-
48
- # Compute similarity between the target and each paragraph in the list
49
- similarity_scores = compute_similarity(target_embedding, list_embeddings)
50
-
51
- # Combine paragraphs with their similarity scores
52
- results = [
53
- {'compared_paragraph': paragraph_list[i], 'similarity_score': similarity_scores[i]}
54
- for i in range(len(paragraph_list))
55
- ]
56
-
57
- # Sort the results by similarity score in descending order and take the top N
58
- sorted_results = sorted(results, key=lambda x: x['similarity_score'], reverse=True)[:top_n]
59
-
60
- # Return only the top N most similar paragraphs
61
- return sorted_results
62
 
63
 
64
 
 
17
  #model = BertModel.from_pretrained(bert_model_name)
18
  #model.eval()
19
 
20
+ #import torch
21
+ #import numpy as np
22
+ #from sklearn.metrics.pairwise import cosine_similarity
23
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
 
26