albertmartinez committed on
Commit
e9536a9
·
1 Parent(s): 1822f54

Added score block number

Browse files
Files changed (4) hide show
  1. app.py +5 -2
  2. mining.py +3 -3
  3. sts.py +4 -4
  4. utils.py +1 -1
app.py CHANGED
@@ -25,7 +25,9 @@ with gr.Blocks() as demo:
25
 
26
  with gr.Row():
27
  with gr.Column():
 
28
  submit_button_mining = gr.Button("Submit", variant="primary")
 
29
  with gr.Row():
30
  with gr.Column():
31
  output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
@@ -33,7 +35,7 @@ with gr.Blocks() as demo:
33
 
34
  submit_button_mining.click(
35
  fn=mining,
36
- inputs=upload_button_sentences,
37
  outputs=output_mining
38
  )
39
 
@@ -71,6 +73,7 @@ with gr.Blocks() as demo:
71
 
72
  with gr.Row():
73
  with gr.Column():
 
74
  submit_button_sts = gr.Button("Submit", variant="primary")
75
 
76
  with gr.Row():
@@ -82,7 +85,7 @@ with gr.Blocks() as demo:
82
 
83
  submit_button_sts.click(
84
  fn=sts,
85
- inputs=[upload_button_sentences1, upload_button_sentences2],
86
  outputs=output_sts
87
  )
88
 
 
25
 
26
  with gr.Row():
27
  with gr.Column():
28
+ score_mining = gr.Number(label="score", value=0.96, interactive=True)
29
  submit_button_mining = gr.Button("Submit", variant="primary")
30
+
31
  with gr.Row():
32
  with gr.Column():
33
  output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
 
35
 
36
  submit_button_mining.click(
37
  fn=mining,
38
+ inputs=[upload_button_sentences, score_mining],
39
  outputs=output_mining
40
  )
41
 
 
73
 
74
  with gr.Row():
75
  with gr.Column():
76
+ score_sts = gr.Number(label="score", value=0.96, interactive=True)
77
  submit_button_sts = gr.Button("Submit", variant="primary")
78
 
79
  with gr.Row():
 
85
 
86
  submit_button_sts.click(
87
  fn=sts,
88
+ inputs=[upload_button_sentences1, upload_button_sentences2, score_sts],
89
  outputs=output_sts
90
  )
91
 
mining.py CHANGED
@@ -6,9 +6,9 @@ from datasets import Dataset
6
  from sentence_transformers import SentenceTransformer
7
  from sentence_transformers.util import paraphrase_mining
8
 
9
- def mining(path):
10
  st = time.time()
11
- data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"]))
12
 
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  model = SentenceTransformer(
@@ -38,7 +38,7 @@ def mining(path):
38
  pl.col("score").round(3).cast(pl.Float32),
39
  union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
40
  union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
41
- ]).filter(pl.col("score") > 0.96).sort(["score"], descending=True)
42
 
43
  elapsed_time = time.time() - st
44
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
 
6
  from sentence_transformers import SentenceTransformer
7
  from sentence_transformers.util import paraphrase_mining
8
 
9
+ def mining(path, score):
10
  st = time.time()
11
+ data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
12
 
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  model = SentenceTransformer(
 
38
  pl.col("score").round(3).cast(pl.Float32),
39
  union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
40
  union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
41
+ ]).filter(pl.col("score") > score).sort(["score"], descending=True)
42
 
43
  elapsed_time = time.time() - st
44
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
sts.py CHANGED
@@ -5,7 +5,7 @@ import torch
5
  from datasets import Dataset
6
  from sentence_transformers import SentenceTransformer
7
 
8
- def sts(data1, data2):
9
  st = time.time()
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -17,8 +17,8 @@ def sts(data1, data2):
17
  trust_remote_code=True,
18
  )
19
 
20
- sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"]))
21
- sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"]))
22
 
23
  embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
24
  show_progress_bar=True)
@@ -52,5 +52,5 @@ def sts(data1, data2):
52
  elapsed_time = time.time() - st
53
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
54
 
55
- return df_long.select(["score", "setences1", "sentences2"]).filter(pl.col("score") > 0.96).sort(["score"],
56
  descending=True)
 
5
  from datasets import Dataset
6
  from sentence_transformers import SentenceTransformer
7
 
8
+ def sts(data1, data2, score):
9
  st = time.time()
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
17
  trust_remote_code=True,
18
  )
19
 
20
+ sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
21
+ sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
22
 
23
  embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
24
  show_progress_bar=True)
 
52
  elapsed_time = time.time() - st
53
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
54
 
55
+ return df_long.select(["score", "setences1", "sentences2"]).filter(pl.col("score") > score).sort(["score"],
56
  descending=True)
utils.py CHANGED
@@ -6,7 +6,7 @@ import polars as pl
6
  import time
7
 
8
  def getDataFrame(path):
9
- data = pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"])
10
  return pl.from_pandas(data)
11
 
12
  def save_to_csv(dataframe):
 
6
  import time
7
 
8
  def getDataFrame(path):
9
+ data = pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"], sep="\t")
10
  return pl.from_pandas(data)
11
 
12
  def save_to_csv(dataframe):