Commit e9536a9
Parent(s): 1822f54
Added score block number
app.py CHANGED

@@ -25,7 +25,9 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
+            score_mining = gr.Number(label="score", value=0.96, interactive=True)
             submit_button_mining = gr.Button("Submit", variant="primary")
+
     with gr.Row():
         with gr.Column():
             output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
@@ -33,7 +35,7 @@ with gr.Blocks() as demo:
 
     submit_button_mining.click(
         fn=mining,
-        inputs=upload_button_sentences,
+        inputs=[upload_button_sentences, score_mining],
         outputs=output_mining
     )
 
@@ -71,6 +73,7 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
+            score_sts = gr.Number(label="score", value=0.96, interactive=True)
             submit_button_sts = gr.Button("Submit", variant="primary")
 
     with gr.Row():
@@ -82,7 +85,7 @@ with gr.Blocks() as demo:
 
     submit_button_sts.click(
         fn=sts,
-        inputs=[upload_button_sentences1, upload_button_sentences2],
+        inputs=[upload_button_sentences1, upload_button_sentences2, score_sts],
         outputs=output_sts
     )
 
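In short, each tab gains a score threshold field whose value is passed to the backend function together with the uploaded file. The sketch below illustrates that wiring in isolation; it is not the app itself — the upload component is assumed to be a gr.File, the output is simplified to a Textbox, and the handler is a stand-in for the real mining():

import gradio as gr

def mining(path, score):
    # Stand-in for the real mining(): just echoes what the UI passed in.
    return f"would mine {path!r} with score threshold {score}"

with gr.Blocks() as demo:
    upload_button_sentences = gr.File(label="sentences")  # assumed component type
    score_mining = gr.Number(label="score", value=0.96, interactive=True)
    submit_button_mining = gr.Button("Submit", variant="primary")
    output_mining = gr.Textbox(label="result")  # simplified output for the sketch

    # Both components are listed as inputs, so their current values become the
    # (path, score) arguments of mining() on every click.
    submit_button_mining.click(
        fn=mining,
        inputs=[upload_button_sentences, score_mining],
        outputs=output_mining,
    )

if __name__ == "__main__":
    demo.launch()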
mining.py CHANGED

@@ -6,9 +6,9 @@ from datasets import Dataset
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.util import paraphrase_mining
 
-def mining(path):
+def mining(path, score):
     st = time.time()
-    data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"]))
+    data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -38,7 +38,7 @@ def mining(path):
         pl.col("score").round(3).cast(pl.Float32),
         union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
         union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
-    ]).filter(pl.col("score") >
+    ]).filter(pl.col("score") > score).sort(["score"], descending=True)
 
     elapsed_time = time.time() - st
     print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
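mining() now takes the threshold as an argument instead of using a hard-coded value in the filter, and the result is additionally sorted by score. The snippet below reproduces just that filtering step on toy data; in mining.py the triples come from paraphrase_mining(), which returns [score, id_1, id_2] entries:

import polars as pl

# Toy stand-in for the mined pairs; in mining.py they come from paraphrase_mining().
pairs = [[0.99, 0, 1], [0.80, 1, 2], [0.97, 2, 3]]
df = pl.DataFrame(pairs, schema=["score", "sentence_1", "sentence_2"], orient="row")

score = 0.96  # value supplied by the new gr.Number component
print(df.filter(pl.col("score") > score).sort(["score"], descending=True))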
sts.py CHANGED

@@ -5,7 +5,7 @@ import torch
 from datasets import Dataset
 from sentence_transformers import SentenceTransformer
 
-def sts(data1, data2):
+def sts(data1, data2, score):
     st = time.time()
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -17,8 +17,8 @@ def sts(data1, data2):
         trust_remote_code=True,
     )
 
-    sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"]))
-    sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"]))
+    sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
+    sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, names=["text"], sep="\t"))
 
     embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
                                show_progress_bar=True)
@@ -52,5 +52,5 @@ def sts(data1, data2):
     elapsed_time = time.time() - st
     print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
 
-    return df_long.select(["score", "setences1", "sentences2"]).filter(pl.col("score") >
+    return df_long.select(["score", "setences1", "sentences2"]).filter(pl.col("score") > score).sort(["score"],
                           descending=True)
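sts() follows the same pattern: the long-format similarity table df_long is filtered with the user-supplied score instead of a fixed threshold. How df_long is built is not shown in these hunks, so the sketch below is only one plausible construction using util.cos_sim and toy embeddings; just the final select/filter/sort expression comes from the diff (the "setences1" column name is kept as spelled in the source):

import polars as pl
import torch
from sentence_transformers import util

# Toy embeddings standing in for embeddings1/embeddings2 computed inside sts().
embeddings1 = torch.nn.functional.normalize(torch.randn(3, 8), dim=1)
embeddings2 = torch.nn.functional.normalize(torch.randn(4, 8), dim=1)
scores = util.cos_sim(embeddings1, embeddings2)  # (3, 4) similarity matrix

# One row per sentence pair, mirroring the long-format table used in sts().
rows = [
    {"setences1": i, "sentences2": j, "score": float(scores[i, j])}
    for i in range(scores.shape[0])
    for j in range(scores.shape[1])
]
df_long = pl.DataFrame(rows)

score = 0.5  # threshold that now arrives from the new gr.Number component
print(
    df_long.select(["score", "setences1", "sentences2"])
    .filter(pl.col("score") > score)
    .sort(["score"], descending=True)
)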
utils.py CHANGED

@@ -6,7 +6,7 @@ import polars as pl
 import time
 
 def getDataFrame(path):
-    data = pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"])
+    data = pd.read_csv(path, on_bad_lines='skip', header=0, names=["text"], sep="\t")
     return pl.from_pandas(data)
 
 def save_to_csv(dataframe):
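The only change in utils.py is the sep="\t" argument, so uploaded files are now parsed as tab-separated rather than comma-separated, which keeps sentences containing commas in a single "text" column. A quick way to check what the updated read does, using an in-memory buffer in place of an uploaded file path:

import io
import pandas as pd
import polars as pl

tsv = "text\nfirst sentence\nsecond sentence, with a comma\n"

# Same read as the updated getDataFrame(): tab-separated, malformed lines skipped,
# first row treated as a header and the single column named "text".
data = pd.read_csv(io.StringIO(tsv), on_bad_lines='skip', header=0, names=["text"], sep="\t")
print(pl.from_pandas(data))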