tyang committed
Commit bc62de0 · 1 Parent(s): 6b08394

Update app.py

Files changed (1):
1. app.py (+48 -14)
app.py CHANGED
@@ -2,31 +2,61 @@ import torch
 from scipy.spatial.distance import cosine
 from transformers import AutoModel, AutoTokenizer
 from thefuzz import fuzz
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 
-tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
-model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
+
+tokenizer_simcse = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
+model_simcse = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
+tokenizer_mpnet = AutoTokenizer.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
+model_mpnet = AutoModel.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
+vectorizer = TfidfVectorizer()
+
+
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
 def thefuzz(text1, text2):
     score = fuzz.token_sort_ratio(text1, text2)
-    return {'token sort ratio':score/100}
+    return {'levenshtein distance of sorted tokens':score/100}
+
+
+def tfidf(text1, text2):
+    t1_tfidf = vectorizer.fit_transform([text1])
+    t2_tfidf = vectorizer.transform([text2])
+    cosine_sim = cosine_similarity(t1_tfidf, t2_tfidf).flatten()[0]
+    return {'cosine similarity of tf-idf vectors':cosine_sim}
 
 
 def simcse(text1, text2):
-    # Tokenize input texts
     texts = [text1,text2]
-    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-    # Get the embeddings
+    inputs = tokenizer_simcse(texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
-        embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
-    cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
-    return {"cosine similarity of simcse embeddings":cosine_sim_0_1}
+        embeddings = model_simcse(**inputs, output_hidden_states=True, return_dict=True).pooler_output
+    cosine_sim = 1 - cosine(embeddings[0], embeddings[1])
+    return {"cosine similarity of simcse embeddings":cosine_sim}
+
+
+def mpnet(text1, text2):
+    encoded_input = tokenizer_mpnet([text1,text2], padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        model_output = model_mpnet(**encoded_input)
+    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    cosine_sim = 1 - cosine(sentence_embeddings[0], sentence_embeddings[1])
+    return {"cosine similarity of stsb-mpnet embeddings":cosine_sim}
+
 
 def get_scores(text1, text2):
     fuzz_out = thefuzz(text1, text2)
+    tfidf_out = tfidf(text1, text2)
     simcse_out = simcse(text1, text2)
-    return simcse_out, fuzz_out
+    mpnet_out = mpnet(text1, text2)
+
+    return simcse_out, mpnet_out, fuzz_out, tfidf_out
 
 inputs = [
     gr.inputs.Textbox(lines=5, label="Input Text One"),
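A note on the tfidf() helper added above: vectorizer.fit_transform([text1]) learns the vocabulary and IDF weights from the first text alone, so the following transform([text2]) silently drops any word that appears only in the second text, and the module-level vectorizer is refitted on every request. If the intent is a cosine similarity over a vocabulary drawn from both inputs, a variant along these lines may be closer (a sketch only; tfidf_pair and vec are illustrative names, not part of this commit):

# Sketch: uses the same sklearn imports this commit adds at the top of app.py.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_pair(text1, text2):
    # Fit on both texts so words unique to either input stay in the vocabulary;
    # a fresh vectorizer per call also avoids sharing fit state across requests.
    vec = TfidfVectorizer()
    tfidf_matrix = vec.fit_transform([text1, text2])   # shape: (2, vocab_size)
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).item()
    return {'cosine similarity of tf-idf vectors': cosine_sim}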
 
@@ -34,11 +64,13 @@ inputs = [
 ]
 outputs = [
     gr.outputs.Label(type="confidences",label="Cosine similarity based on SimCSE embeddings"),
+    gr.outputs.Label(type="confidences",label="Cosine similarity based on stsb-mpnet embeddings"),
     gr.outputs.Label(type="confidences",label="Token sort ratio using Levenshtein distance"),
+    gr.outputs.Label(type="confidences",label="Cosine similarity based on tf-idf vectors"),
 ]
-title = "SimCSE vs thefuzz"
-description = "Simple app for comparing text similarity scores using Princeton-NLP SimCSE and thefuzz from SeatGeek. Interface by Troy Yang."
-article = "<p style='text-align: center'><a href='https://github.com/princeton-nlp/SimCSE'>SimCSE: Simple Contrastive Learning of Sentence Embeddings</a> | <a href='https://github.com/seatgeek/thefuzz'>thefuzz: Fuzzy String Matching in Python</a></p>"
+title = "SimCSE vs MPNet vs thefuzz vs TF-IDF"
+description = "Demo for comparing semantic text similarity methods: Princeton-NLP SimCSE, stsb-mpnet-base-v2 from sentence-transformers (MPNet from Microsoft as the backbone), thefuzz from SeatGeek, and TF-IDF. Interface by Troy Yang."
+article = "<p style='text-align: center'><a href='https://github.com/princeton-nlp/SimCSE'>SimCSE: Simple Contrastive Learning of Sentence Embeddings</a> | <a href='https://huggingface.co/sentence-transformers/stsb-mpnet-base-v2'>stsb-mpnet-base-v2 HuggingFace model card</a> | <a href='https://github.com/seatgeek/thefuzz'>thefuzz: Fuzzy String Matching in Python</a></p>"
 examples = [
     ["There's a kid on a skateboard.","A kid is skateboarding."],
     ['There is no boy standing in front of the blue building in the space reserved for handicapped people', 'A boy is standing in front of the blue building in the space reserved for handicapped people'],
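The new output label and description reference sentence-transformers/stsb-mpnet-base-v2, and the mean_pooling + AutoModel path added in the first hunk appears to follow the standard mean-pooling recipe for that model. If taking sentence-transformers as an extra dependency were acceptable (an assumption; this commit uses transformers directly), the same embeddings could be produced, and the manual path cross-checked, with a shorter helper (a sketch; mpnet_st and st_model are illustrative names):

# Sketch only: needs `pip install sentence-transformers`; not used by app.py.
from sentence_transformers import SentenceTransformer, util

st_model = SentenceTransformer('sentence-transformers/stsb-mpnet-base-v2')

def mpnet_st(text1, text2):
    # encode() applies the model's own tokenization and mean pooling internally.
    embeddings = st_model.encode([text1, text2], convert_to_tensor=True)
    cosine_sim = util.cos_sim(embeddings[0], embeddings[1]).item()
    return {"cosine similarity of stsb-mpnet embeddings": cosine_sim}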
 
@@ -61,4 +93,6 @@ examples = [
     ,['A young boy is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swim goggles on her head',
     'A young girl is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swimming goggles on her head']
 ]
-gr.Interface(get_scores, inputs, outputs, title=title, description=description, article=article, examples=examples).launch(share=True)#()#
+
+gr.Interface(get_scores, inputs, outputs, title=title, description=description, article=article,
+             theme="darkdefault", examples=examples, flagging_options=["strongly related","related", "neutral", "unrelated", "strongly unrelated"]).launch(share=True,enable_queue=True)#()#
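The rewritten Interface call keeps the Gradio 2.x-era API: the gr.inputs / gr.outputs namespaces, a string theme ("darkdefault"), and enable_queue=True passed to launch(), which presumably matches the Gradio version pinned for this Space. For reference only, here is a rough sketch of the same wiring in the newer component style used by Gradio 3.x/4.x, where those namespaces are deprecated or removed; argument details vary by release (theme handling also differs and is omitted), so treat this as an assumption to verify against the pinned version:

# Sketch for newer Gradio releases; not part of this commit.
inputs = [
    gr.Textbox(lines=5, label="Input Text One"),
    # ...second Textbox as in app.py (not shown in this diff)
]
outputs = [
    gr.Label(label="Cosine similarity based on SimCSE embeddings"),
    gr.Label(label="Cosine similarity based on stsb-mpnet embeddings"),
    gr.Label(label="Token sort ratio using Levenshtein distance"),
    gr.Label(label="Cosine similarity based on tf-idf vectors"),
]
demo = gr.Interface(get_scores, inputs, outputs, title=title, description=description,
                    article=article, examples=examples,
                    flagging_options=["strongly related", "related", "neutral", "unrelated", "strongly unrelated"])
demo.queue().launch(share=True)  # queue() replaces enable_queue=True in launch()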