albertmartinez committed on
Commit
222cf81
·
1 Parent(s): 77196ea

update mining

Browse files
Files changed (2) hide show
  1. app.py +4 -1
  2. mining.py +17 -6
app.py CHANGED
@@ -25,6 +25,9 @@ with gr.Blocks() as demo:
25
 
26
  with gr.Row():
27
  with gr.Column():
 
 
 
28
  score_mining = gr.Number(label="score", value=0.96, interactive=True)
29
  submit_button_mining = gr.Button("Submit", variant="primary")
30
 
@@ -35,7 +38,7 @@ with gr.Blocks() as demo:
35
 
36
  submit_button_mining.click(
37
  fn=mining,
38
- inputs=[upload_button_sentences, score_mining],
39
  outputs=output_mining
40
  )
41
 
 
25
 
26
  with gr.Row():
27
  with gr.Column():
28
+ model = gr.Dropdown(
29
+ ["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
30
+ "intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
31
  score_mining = gr.Number(label="score", value=0.96, interactive=True)
32
  submit_button_mining = gr.Button("Submit", variant="primary")
33
 
 
38
 
39
  submit_button_mining.click(
40
  fn=mining,
41
+ inputs=[model, upload_button_sentences, score_mining],
42
  outputs=output_mining
43
  )
44
 
mining.py CHANGED
@@ -6,15 +6,15 @@ from datasets import Dataset
6
  from sentence_transformers import SentenceTransformer
7
  from sentence_transformers.util import paraphrase_mining
8
 
9
- def mining(path, score):
 
10
  st = time.time()
11
- data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
 
12
 
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  model = SentenceTransformer(
15
- "sentence-transformers/all-MiniLM-L6-v2",
16
- backend="openvino",
17
- model_kwargs={"file_name": "openvino/openvino_model.xml"},
18
  device=device,
19
  trust_remote_code=True,
20
  )
@@ -34,13 +34,24 @@ def mining(path, score):
34
 
35
  union_df = pl.DataFrame(data.to_pandas())
36
 
 
 
 
 
 
 
 
 
 
 
37
  df = df.with_columns([
38
  pl.col("score").round(3).cast(pl.Float32),
39
  union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
40
  union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
 
41
  ]).filter(pl.col("score") > score).sort(["score"], descending=True)
42
 
43
  elapsed_time = time.time() - st
44
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
45
 
46
- return df
 
6
  from sentence_transformers import SentenceTransformer
7
  from sentence_transformers.util import paraphrase_mining
8
 
9
+
10
+ def mining(modelname, path, score):
11
  st = time.time()
12
+ data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t"))
13
+ original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
14
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model = SentenceTransformer(
17
+ modelname,
 
 
18
  device=device,
19
  trust_remote_code=True,
20
  )
 
34
 
35
  union_df = pl.DataFrame(data.to_pandas())
36
 
37
+ original_columns = original_df.columns.tolist()
38
+
39
+ additional_cols = []
40
+ for col in original_columns:
41
+ if col != "text":
42
+ additional_cols.extend([
43
+ union_df.select(pl.col(col)).to_series()[df["sentence_1"].cast(pl.Int32)].alias(f"{col}_1"),
44
+ union_df.select(pl.col(col)).to_series()[df["sentence_2"].cast(pl.Int32)].alias(f"{col}_2")
45
+ ])
46
+
47
  df = df.with_columns([
48
  pl.col("score").round(3).cast(pl.Float32),
49
  union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
50
  union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
51
+ *additional_cols
52
  ]).filter(pl.col("score") > score).sort(["score"], descending=True)
53
 
54
  elapsed_time = time.time() - st
55
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
56
 
57
+ return df