tyang committed
Commit bc62de0 · 1 Parent(s): 6b08394

Update app.py

Files changed (1):
1. app.py (+48 -14)
app.py CHANGED
@@ -2,31 +2,61 @@ import torch
 from scipy.spatial.distance import cosine
 from transformers import AutoModel, AutoTokenizer
 from thefuzz import fuzz
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 
-tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
-model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
+
+tokenizer_simcse = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
+model_simcse = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
+tokenizer_mpnet = AutoTokenizer.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
+model_mpnet = AutoModel.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
+vectorizer = TfidfVectorizer()
+
+
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
 def thefuzz(text1, text2):
     score = fuzz.token_sort_ratio(text1, text2)
-    return {'token sort ratio':score/100}
+    return {'levenshtein distance of sorted tokens':score/100}
+
+
+def tfidf(text1, text2):
+    t1_tfidf = vectorizer.fit_transform([text1])
+    t2_tfidf = vectorizer.transform([text2])
+    cosine_sim = cosine_similarity(t1_tfidf, t2_tfidf).flatten()[0]
+    return {'cosine similarity of tf-idf vectors':cosine_sim}
 
 
 def simcse(text1, text2):
-    # Tokenize input texts
     texts = [text1,text2]
-    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-    # Get the embeddings
+    inputs = tokenizer_simcse(texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
-        embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
-    cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
-    return {"cosine similarity of simcse embeddings":cosine_sim_0_1}
+        embeddings = model_simcse(**inputs, output_hidden_states=True, return_dict=True).pooler_output
+    cosine_sim = 1 - cosine(embeddings[0], embeddings[1])
+    return {"cosine similarity of simcse embeddings":cosine_sim}
+
+
+def mpnet(text1, text2):
+    encoded_input = tokenizer_mpnet([text1,text2], padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        model_output = model_mpnet(**encoded_input)
+    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    cosine_sim = 1 - cosine(sentence_embeddings[0], sentence_embeddings[1])
+    return {"cosine similarity of stsb-mpnet embeddings":cosine_sim}
+
 
 def get_scores(text1, text2):
     fuzz_out = thefuzz(text1, text2)
+    tfidf_out = tfidf(text1, text2)
     simcse_out = simcse(text1, text2)
-    return simcse_out, fuzz_out
+    mpnet_out = mpnet(text1, text2)
+
+    return simcse_out, mpnet_out, fuzz_out, tfidf_out
 
 inputs = [
     gr.inputs.Textbox(lines=5, label="Input Text One"),
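A note on the tfidf() helper added above: vectorizer.fit_transform([text1]) learns the vocabulary and IDF weights from the first text alone, so the following transform([text2]) silently drops any word that appears only in the second text, and the module-level vectorizer is refitted on every request. If the intent is a cosine similarity over a vocabulary drawn from both inputs, a variant along these lines may be closer (a sketch only; tfidf_pair and vec are illustrative names, not part of this commit):

# Sketch: uses the same sklearn imports this commit adds at the top of app.py.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_pair(text1, text2):
    # Fit on both texts so words unique to either input stay in the vocabulary;
    # a fresh vectorizer per call also avoids sharing fit state across requests.
    vec = TfidfVectorizer()
    tfidf_matrix = vec.fit_transform([text1, text2])   # shape: (2, vocab_size)
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).item()
    return {'cosine similarity of tf-idf vectors': cosine_sim}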
 
@@ -34,11 +64,13 @@ inputs = [
 ]
 outputs = [
     gr.outputs.Label(type="confidences",label="Cosine similarity based on SimCSE embeddings"),
+    gr.outputs.Label(type="confidences",label="Cosine similarity based on stsb-mpnet embeddings"),
     gr.outputs.Label(type="confidences",label="Token sort ratio using Levenshtein distance"),
+    gr.outputs.Label(type="confidences",label="Cosine similarity based on tf-idf vectors"),
 ]
-title = "SimCSE vs thefuzz"
-description = "Simple app for comparing text similarity scores using Princeton-NLP SimCSE and thefuzz from SeatGeek. Interface by Troy Yang."
-article = "<p style='text-align: center'><a href='https://github.com/princeton-nlp/SimCSE'>SimCSE: Simple Contrastive Learning of Sentence Embeddings</a> | <a href='https://github.com/seatgeek/thefuzz'>thefuzz: Fuzzy String Matching in Python</a></p>"
+title = "SimCSE vs MPNet vs thefuzz vs TF-IDF"
+description = "Demo for comparing semantic text similarity methods: Princeton-NLP SimCSE, stsb-mpnet-base-v2 from sentence-transformers (MPNet from Microsoft as the backbone), thefuzz from SeatGeek, and TF-IDF. Interface by Troy Yang."
+article = "<p style='text-align: center'><a href='https://github.com/princeton-nlp/SimCSE'>SimCSE: Simple Contrastive Learning of Sentence Embeddings</a> | <a href='https://huggingface.co/sentence-transformers/stsb-mpnet-base-v2'>stsb-mpnet-base-v2 HuggingFace model card</a> | <a href='https://github.com/seatgeek/thefuzz'>thefuzz: Fuzzy String Matching in Python</a></p>"
 examples = [
     ["There's a kid on a skateboard.","A kid is skateboarding."],
     ['There is no boy standing in front of the blue building in the space reserved for handicapped people', 'A boy is standing in front of the blue building in the space reserved for handicapped people'],
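The new output label and description reference sentence-transformers/stsb-mpnet-base-v2, and the mean_pooling + AutoModel path added in the first hunk appears to follow the standard mean-pooling recipe for that model. If taking sentence-transformers as an extra dependency were acceptable (an assumption; this commit uses transformers directly), the same embeddings could be produced, and the manual path cross-checked, with a shorter helper (a sketch; mpnet_st and st_model are illustrative names):

# Sketch only: needs `pip install sentence-transformers`; not used by app.py.
from sentence_transformers import SentenceTransformer, util

st_model = SentenceTransformer('sentence-transformers/stsb-mpnet-base-v2')

def mpnet_st(text1, text2):
    # encode() applies the model's own tokenization and mean pooling internally.
    embeddings = st_model.encode([text1, text2], convert_to_tensor=True)
    cosine_sim = util.cos_sim(embeddings[0], embeddings[1]).item()
    return {"cosine similarity of stsb-mpnet embeddings": cosine_sim}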
 
@@ -61,4 +93,6 @@ examples = [
     ,['A young boy is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swim goggles on her head',
     'A young girl is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swimming goggles on her head']
 ]
-gr.Interface(get_scores, inputs, outputs, title=title, description=description, article=article, examples=examples).launch(share=True)#()#
+
+gr.Interface(get_scores, inputs, outputs, title=title, description=description, article=article,
+             theme="darkdefault", examples=examples, flagging_options=["strongly related","related", "neutral", "unrelated", "strongly unrelated"]).launch(share=True,enable_queue=True)#()#
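The rewritten Interface call keeps the Gradio 2.x-era API: the gr.inputs / gr.outputs namespaces, a string theme ("darkdefault"), and enable_queue=True passed to launch(), which presumably matches the Gradio version pinned for this Space. For reference only, here is a rough sketch of the same wiring in the newer component style used by Gradio 3.x/4.x, where those namespaces are deprecated or removed; argument details vary by release (theme handling also differs and is omitted), so treat this as an assumption to verify against the pinned version:

# Sketch for newer Gradio releases; not part of this commit.
inputs = [
    gr.Textbox(lines=5, label="Input Text One"),
    # ...second Textbox as in app.py (not shown in this diff)
]
outputs = [
    gr.Label(label="Cosine similarity based on SimCSE embeddings"),
    gr.Label(label="Cosine similarity based on stsb-mpnet embeddings"),
    gr.Label(label="Token sort ratio using Levenshtein distance"),
    gr.Label(label="Cosine similarity based on tf-idf vectors"),
]
demo = gr.Interface(get_scores, inputs, outputs, title=title, description=description,
                    article=article, examples=examples,
                    flagging_options=["strongly related", "related", "neutral", "unrelated", "strongly unrelated"])
demo.queue().launch(share=True)  # queue() replaces enable_queue=True in launch()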