# Uploaded by: albertmartinez
# Commit 222cf81: "update mining"
import time
import pandas as pd
import polars as pl
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining
def mining(modelname, path, score):
    """Mine similar sentence pairs from the ``text`` column of a TSV file.

    The file at ``path`` is read as tab-separated values with a header row
    (malformed lines are skipped). Every entry of its ``text`` column is
    embedded with the sentence-transformers model ``modelname`` and passed to
    ``paraphrase_mining``. The resulting index pairs are resolved back to the
    original rows.

    Args:
        modelname: Model name or path handed to ``SentenceTransformer``.
        path: Location of the tab-separated input file; it must contain a
            ``text`` column.
        score: Threshold; only pairs whose score (rounded to 3 decimals and
            stored as Float32) is strictly greater than this value are kept.

    Returns:
        A polars ``DataFrame`` sorted by ``score`` in descending order with
        the columns ``score``, ``sentence_1``, ``sentence_2`` and, for every
        other input column ``c``, ``c_1`` and ``c_2`` holding that column's
        value for the first and second sentence of the pair. The frame is
        empty (but has the same columns) when no pair qualifies.

    Raises:
        KeyError: If the input file has no ``text`` column.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments.
    started = time.perf_counter()

    # Parse the file once and derive the Dataset from that single frame.
    original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
    if "text" not in original_df.columns:
        # Fail before the (potentially expensive) model load.
        raise KeyError(f"Column 'text' not found in {path!r}")
    data = Dataset.from_pandas(original_df)

    if len(data) < 2:
        # No pair can be formed from fewer than two sentences; an empty
        # corpus would also turn corpus_chunk_size into an invalid 0.
        paraphrases = []
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(security): trust_remote_code=True lets the model repository
        # run its own Python code; only pass model names you trust.
        model = SentenceTransformer(
            modelname,
            device=device,
            trust_remote_code=True,
        )
        # The whole corpus is compared as a single chunk and max_pairs is set
        # to the number of ordered pairs; top_k is left at the library
        # default.
        paraphrases = paraphrase_mining(
            model,
            data["text"],
            corpus_chunk_size=len(data),
            show_progress_bar=True,
            batch_size=1024,
            max_pairs=len(data) ** 2,
        )

    # Each mined entry is [score, index_1, index_2]. Building the columns
    # with explicit dtypes keeps the schema stable even when nothing was
    # mined (the previous positional rename failed on an empty result).
    pairs = pl.DataFrame([
        pl.Series("score", [entry[0] for entry in paraphrases], dtype=pl.Float64),
        pl.Series("sentence_1", [entry[1] for entry in paraphrases], dtype=pl.Int64),
        pl.Series("sentence_2", [entry[2] for entry in paraphrases], dtype=pl.Int64),
    ])

    # Round and filter first so the row lookups below only touch the pairs
    # that are actually returned; the filter sees the same rounded Float32
    # values as before.
    pairs = pairs.with_columns(
        pl.col("score").round(3).cast(pl.Float32)
    ).filter(pl.col("score") > score)

    union_df = pl.DataFrame(data.to_pandas())
    first_idx = pairs["sentence_1"].cast(pl.Int32)
    second_idx = pairs["sentence_2"].cast(pl.Int32)

    # The index columns are replaced by the sentence text; every other input
    # column is appended twice, once per side of the pair.
    resolved = [
        union_df["text"][first_idx].alias("sentence_1"),
        union_df["text"][second_idx].alias("sentence_2"),
    ]
    for col in original_df.columns.tolist():
        if col != "text":
            resolved.append(union_df[col][first_idx].alias(f"{col}_1"))
            resolved.append(union_df[col][second_idx].alias(f"{col}_2"))

    df = pairs.with_columns(resolved).sort(["score"], descending=True)

    elapsed_time = time.perf_counter() - started
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    return df