Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,31 +2,61 @@ import torch
|
|
2 |
from scipy.spatial.distance import cosine
|
3 |
from transformers import AutoModel, AutoTokenizer
|
4 |
from thefuzz import fuzz
|
|
|
|
|
5 |
import gradio as gr
|
6 |
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
def thefuzz(text1, text2):
|
12 |
score = fuzz.token_sort_ratio(text1, text2)
|
13 |
-
return {'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
|
16 |
def simcse(text1, text2):
|
17 |
-
# Tokenize input texts
|
18 |
texts = [text1,text2]
|
19 |
-
inputs =
|
20 |
-
# Get the embeddings
|
21 |
with torch.no_grad():
|
22 |
-
embeddings =
|
23 |
-
|
24 |
-
return {"cosine similarity of simcse embeddings":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def get_scores(text1, text2):
|
27 |
fuzz_out = thefuzz(text1, text2)
|
|
|
28 |
simcse_out = simcse(text1, text2)
|
29 |
-
|
|
|
|
|
30 |
|
31 |
inputs = [
|
32 |
gr.inputs.Textbox(lines=5, label="Input Text One"),
|
@@ -34,11 +64,13 @@ inputs = [
|
|
34 |
]
|
35 |
outputs = [
|
36 |
gr.outputs.Label(type="confidences",label="Cosine similarity based on SimCSE embeddings"),
|
|
|
37 |
gr.outputs.Label(type="confidences",label="Token sort ratio using Levenshtein distance"),
|
|
|
38 |
]
|
39 |
-
title = "SimCSE vs thefuzz"
|
40 |
-
description = "
|
41 |
-
article = "<p style='text-align: center'><a href='https://github.com/princeton-nlp/SimCSE'>SimCSE: Simple Contrastive Learning of Sentence Embeddings</a> | <a href='https://github.com/seatgeek/thefuzz'>thefuzz: Fuzzy String Matching in Python</a></p>"
|
42 |
examples = [
|
43 |
["There's a kid on a skateboard.","A kid is skateboarding."],
|
44 |
['There is no boy standing in front of the blue building in the space reserved for handicapped people', 'A boy is standing in front of the blue building in the space reserved for handicapped people'],
|
@@ -61,4 +93,6 @@ examples = [
|
|
61 |
,['A young boy is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swim goggles on her head',
|
62 |
'A young girl is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swimming goggles on her head']
|
63 |
]
|
64 |
-
|
|
|
|
|
|
2 |
from scipy.spatial.distance import cosine
|
3 |
from transformers import AutoModel, AutoTokenizer
|
4 |
from thefuzz import fuzz
|
5 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
import gradio as gr
|
8 |
|
9 |
+
|
10 |
+
# Hugging Face Hub checkpoint ids for the two sentence-embedding models.
_SIMCSE_CHECKPOINT = "princeton-nlp/sup-simcse-bert-base-uncased"
_MPNET_CHECKPOINT = "sentence-transformers/stsb-mpnet-base-v2"

# Tokenizers and encoders are loaded once at import time so that every
# request handled by the app reuses the same objects.
tokenizer_simcse = AutoTokenizer.from_pretrained(_SIMCSE_CHECKPOINT)
model_simcse = AutoModel.from_pretrained(_SIMCSE_CHECKPOINT)
tokenizer_mpnet = AutoTokenizer.from_pretrained(_MPNET_CHECKPOINT)
model_mpnet = AutoModel.from_pretrained(_MPNET_CHECKPOINT)

# Shared TF-IDF vectorizer (library defaults); it is re-fitted per request.
vectorizer = TfidfVectorizer()
|
15 |
+
|
16 |
+
|
17 |
+
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result; element 0 is used as the
            per-token embeddings (presumably (batch, tokens, dim) -- confirm
            against the model in use).
        attention_mask: Mask with one entry per token; it is broadcast across
            the embedding dimension and used as a 0/1 weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total.
    """
    hidden = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * mask).sum(dim=1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_count
|
21 |
|
22 |
|
23 |
def thefuzz(text1, text2):
    """Score two texts with thefuzz's token-sort ratio.

    The ratio returned by ``fuzz.token_sort_ratio`` is divided by 100 before
    being returned (presumably mapping a 0-100 score onto 0-1).

    Returns:
        A single-entry dict mapping the method label to the scaled score.
    """
    ratio_percent = fuzz.token_sort_ratio(text1, text2)
    return {'levenshtein distance of sorted tokens': ratio_percent / 100}
|
26 |
+
|
27 |
+
|
28 |
+
def tfidf(text1, text2):
    """Score two texts by the cosine similarity of their TF-IDF vectors.

    The vectorizer is fitted on both texts together. Fitting on ``text1``
    alone would drop every term that occurs only in ``text2``, which makes
    the score asymmetric and inflates it whenever ``text2`` contains extra
    words.

    Returns:
        A single-entry dict mapping the method label to the similarity as a
        plain float. The score is 0.0 when the vectorizer rejects the input.
    """
    label = 'cosine similarity of tf-idf vectors'
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError when no tokens survive tokenization
        # (e.g. both texts are empty); report no measurable similarity
        # instead of failing the whole request.
        return {label: 0.0}
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()[0]
    # Cast the NumPy scalar to a built-in float for downstream consumers.
    return {label: float(cosine_sim)}
|
33 |
|
34 |
|
35 |
def simcse(text1, text2):
    """Score two texts by the cosine similarity of their SimCSE embeddings.

    Both texts are tokenized as one padded, truncated batch, encoded without
    gradient tracking, and compared through the model's ``pooler_output``.

    Returns:
        A single-entry dict mapping the method label to the similarity.
    """
    batch = tokenizer_simcse([text1, text2], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        encoded = model_simcse(**batch, output_hidden_states=True, return_dict=True)
        pooled = encoded.pooler_output
    first, second = pooled[0], pooled[1]
    # scipy's `cosine` is a distance; subtract from 1 to get similarity.
    similarity = 1 - cosine(first, second)
    return {"cosine similarity of simcse embeddings": similarity}
|
42 |
+
|
43 |
+
|
44 |
+
def mpnet(text1, text2):
    """Score two texts by the cosine similarity of their stsb-mpnet embeddings.

    Both texts are tokenized as one padded, truncated batch, encoded without
    gradient tracking, and reduced to one vector each with ``mean_pooling``.

    Returns:
        A single-entry dict mapping the method label to the similarity.
    """
    batch = tokenizer_mpnet([text1, text2], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        raw_output = model_mpnet(**batch)
    pooled = mean_pooling(raw_output, batch['attention_mask'])
    first, second = pooled[0], pooled[1]
    # scipy's `cosine` is a distance; subtract from 1 to get similarity.
    return {"cosine similarity of stsb-mpnet embeddings": 1 - cosine(first, second)}
|
51 |
+
|
52 |
|
53 |
def get_scores(text1, text2):
    """Run every similarity method on the same pair of texts.

    Returns:
        A 4-tuple of single-entry dicts in the order SimCSE, MPNet, thefuzz,
        TF-IDF.
    """
    pair = (text1, text2)
    lexical_fuzz = thefuzz(*pair)
    lexical_tfidf = tfidf(*pair)
    semantic_simcse = simcse(*pair)
    semantic_mpnet = mpnet(*pair)
    return semantic_simcse, semantic_mpnet, lexical_fuzz, lexical_tfidf
|
60 |
|
61 |
inputs = [
|
62 |
gr.inputs.Textbox(lines=5, label="Input Text One"),
|
|
|
64 |
]
|
65 |
# One Label component per similarity method, listed in the same order in
# which get_scores returns its results.
_output_labels = (
    "Cosine similarity based on SimCSE embeddings",
    "Cosine similarity based on stsb-mpnet embeddings",
    "Token sort ratio using Levenshtein distance",
    "Cosine similarity based on tf-idf vectors",
)
outputs = [gr.outputs.Label(type="confidences", label=text) for text in _output_labels]
|
71 |
+
# Static text shown by the demo page: heading, short description, and an
# HTML footer linking to each method's project page.
title = "SimCSE vs MPNet vs thefuzz vs TF-IDF"
description = (
    "Demo for comparing semantic text similarity methods. "
    "Princeton-NLP SimCSE, stsb-mpnet-base-v2 from sentence-transformers "
    "(MPnet from Microsoft as the backbone), thefuzz from SeatGeek, and TF-IDF. "
    "Interface by Troy Yang."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/princeton-nlp/SimCSE'>SimCSE: Simple Contrastive Learning of Sentence Embeddings</a>"
    " | "
    "<a href='https://huggingface.co/sentence-transformers/stsb-mpnet-base-v2'>stsb-mpnet-base-v2 HuggingFace model card</a>"
    " | "
    "<a href='https://github.com/seatgeek/thefuzz'>thefuzz: Fuzzy String Matching in Python</a>"
    "</p>"
)
|
74 |
examples = [
|
75 |
["There's a kid on a skateboard.","A kid is skateboarding."],
|
76 |
['There is no boy standing in front of the blue building in the space reserved for handicapped people', 'A boy is standing in front of the blue building in the space reserved for handicapped people'],
|
|
|
93 |
,['A young boy is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swim goggles on her head',
|
94 |
'A young girl is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swimming goggles on her head']
|
95 |
]
|
96 |
+
|
97 |
+
# Assemble the comparison demo and start serving it. The flagging options let
# a user record how related they judge the two input texts to be.
gr.Interface(
    get_scores,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    theme="darkdefault",
    examples=examples,
    flagging_options=[
        "strongly related",
        "related",
        "neutral",
        "unrelated",
        "strongly unrelated",  # was misspelled "stongly unrelated"
    ],
).launch(share=True, enable_queue=True)
|