import time

import pandas as pd
import polars as pl
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining


def mining(modelname: str, path: str, score: float) -> pl.DataFrame:
    """Mine paraphrase pairs from the ``text`` column of a tab-separated file.

    The file at *path* is read with pandas (tab separator, first row as
    header, malformed lines skipped). Every value of its ``text`` column is
    embedded with the SentenceTransformer model *modelname* and compared via
    ``paraphrase_mining``. Pairs whose similarity, rounded to three decimals,
    is strictly greater than *score* are returned.

    Args:
        modelname: Model name or path handed to ``SentenceTransformer``.
        path: Location of the tab-separated input; passed to
            ``pandas.read_csv`` unchanged.
        score: Exclusive lower bound applied to the rounded similarity.

    Returns:
        A polars DataFrame sorted by ``score`` in descending order with the
        columns ``score`` (Float32), ``sentence_1`` and ``sentence_2`` (the
        two texts of the pair) and, for every other input column ``c``,
        ``c_1`` and ``c_2`` holding that column's value for the first and
        second sentence. The frame has zero rows when no pair qualifies.

    Side effects:
        Prints the elapsed time as ``HH:MM:SS`` to stdout.
    """
    started = time.perf_counter()

    # Parse the file once; the same frame feeds both the Dataset and the
    # list of columns to carry over into the result.
    source_df = pd.read_csv(path, on_bad_lines="skip", header=0, sep="\t")
    data = Dataset.from_pandas(source_df)

    # Materialise the column as a plain list so the encoder always receives
    # a list, whatever container type the Dataset column indexing returns.
    sentences = list(data["text"])

    if len(sentences) < 2:
        # No pair can exist; also avoids passing corpus_chunk_size=0 to the
        # miner and skips loading the model for nothing.
        paraphrases = []
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(security): trust_remote_code=True lets the model repository
        # execute its own Python code on this machine. Only pass model names
        # from sources you trust.
        model = SentenceTransformer(
            modelname,
            device=device,
            trust_remote_code=True,
        )
        paraphrases = paraphrase_mining(
            model,
            sentences,
            # One chunk spanning the whole corpus, and a pair budget large
            # enough that max_pairs never truncates the result.
            corpus_chunk_size=len(sentences),
            show_progress_bar=True,
            batch_size=1024,
            max_pairs=len(sentences) ** 2,
        )

    # Each mined entry is [score, index_1, index_2]. Naming the columns and
    # fixing their dtypes up front keeps the schema valid even when nothing
    # was mined (an unnamed empty frame has no columns to rename).
    pairs_pd = pd.DataFrame(
        paraphrases,
        columns=["score", "sentence_1", "sentence_2"],
    ).astype({"score": "float64", "sentence_1": "int64", "sentence_2": "int64"})

    # Round, filter and sort first so the per-column gathers below only touch
    # the pairs that are actually returned.
    pairs = (
        pl.from_pandas(pairs_pd)
        .with_columns(pl.col("score").round(3).cast(pl.Float32))
        .filter(pl.col("score") > score)
        .sort("score", descending=True)
    )

    lookup = pl.DataFrame(data.to_pandas())
    first_idx = pairs["sentence_1"].cast(pl.Int32)
    second_idx = pairs["sentence_2"].cast(pl.Int32)

    # Replace the row indices with the sentence texts, then append every
    # other input column twice: once per side of the pair.
    gathered = [
        lookup["text"][first_idx].alias("sentence_1"),
        lookup["text"][second_idx].alias("sentence_2"),
    ]
    for col in source_df.columns:
        if col == "text":
            continue
        gathered.append(lookup[col][first_idx].alias(f"{col}_1"))
        gathered.append(lookup[col][second_idx].alias(f"{col}_2"))

    result = pairs.with_columns(gathered)

    elapsed_time = time.perf_counter() - started
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    return result