albertmartinez committed on
Commit
77196ea
·
1 Parent(s): d513e74

update sts

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +4 -1
  3. sts.py +9 -10
  4. utils.py +1 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🏢
4
  colorFrom: green
5
  colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.11.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
4
  colorFrom: green
5
  colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.23.1
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py CHANGED
@@ -73,6 +73,9 @@ with gr.Blocks() as demo:
73
 
74
  with gr.Row():
75
  with gr.Column():
 
 
 
76
  score_sts = gr.Number(label="score", value=0.96, interactive=True)
77
  submit_button_sts = gr.Button("Submit", variant="primary")
78
 
@@ -85,7 +88,7 @@ with gr.Blocks() as demo:
85
 
86
  submit_button_sts.click(
87
  fn=sts,
88
- inputs=[upload_button_sentences1, upload_button_sentences2, score_sts],
89
  outputs=output_sts
90
  )
91
 
 
73
 
74
  with gr.Row():
75
  with gr.Column():
76
+ model = gr.Dropdown(
77
+ ["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
78
+ "intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
79
  score_sts = gr.Number(label="score", value=0.96, interactive=True)
80
  submit_button_sts = gr.Button("Submit", variant="primary")
81
 
 
88
 
89
  submit_button_sts.click(
90
  fn=sts,
91
+ inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
92
  outputs=output_sts
93
  )
94
 
sts.py CHANGED
@@ -5,20 +5,19 @@ import torch
5
  from datasets import Dataset
6
  from sentence_transformers import SentenceTransformer
7
 
8
- def sts(data1, data2, score):
 
9
  st = time.time()
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
  model = SentenceTransformer(
13
- "sentence-transformers/all-MiniLM-L6-v2",
14
- backend="openvino",
15
- model_kwargs={"file_name": "openvino/openvino_model.xml"},
16
  device=device,
17
  trust_remote_code=True,
18
  )
19
 
20
- sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
21
- sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
22
 
23
  embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
24
  show_progress_bar=True)
@@ -45,12 +44,12 @@ def sts(data1, data2, score):
45
  .join(df_sentences2, on="column_index"))
46
 
47
  df_long = df_long.rename({
48
- "text": "setences1",
49
  "text_right": "sentences2",
50
- })
51
 
52
  elapsed_time = time.time() - st
53
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
54
 
55
- return df_long.select(["score", "setences1", "sentences2"]).filter(pl.col("score") > score).sort(["score"],
56
- descending=True)
 
5
  from datasets import Dataset
6
  from sentence_transformers import SentenceTransformer
7
 
8
+
9
+ def sts(modelname, data1, data2, score):
10
  st = time.time()
11
 
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  model = SentenceTransformer(
14
+ modelname,
 
 
15
  device=device,
16
  trust_remote_code=True,
17
  )
18
 
19
+ sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, sep="\t"))
20
+ sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, sep="\t"))
21
 
22
  embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
23
  show_progress_bar=True)
 
44
  .join(df_sentences2, on="column_index"))
45
 
46
  df_long = df_long.rename({
47
+ "text": "sentences1",
48
  "text_right": "sentences2",
49
+ }).drop(["row_index", "column_index"])
50
 
51
  elapsed_time = time.time() - st
52
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
53
 
54
+ return df_long.filter(pl.col("score") > score).sort(["score"],
55
+ descending=True)
utils.py CHANGED
@@ -6,7 +6,7 @@ import polars as pl
6
  import time
7
 
8
  def getDataFrame(path):
9
- data = pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"], sep="\t")
10
  return pl.from_pandas(data)
11
 
12
  def save_to_csv(dataframe):
 
6
  import time
7
 
8
  def getDataFrame(path):
9
+ data = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
10
  return pl.from_pandas(data)
11
 
12
  def save_to_csv(dataframe):