Spaces:
Sleeping
Sleeping
import pandas as pd | |
from sentence_transformers import SentenceTransformer, util | |
def filter_similar_sentences(
    model: SentenceTransformer,
    df: pd.DataFrame,
    threshold: float = 0.8,
) -> pd.DataFrame:
    """Drop utterances that are near-duplicates of an earlier kept utterance.

    Rows are visited in order. A row is kept unless an earlier kept row has a
    cosine similarity to it of at least ``threshold``; each such pair is
    printed.

    Args:
        model: Model whose ``encode`` method embeds the utterances.
        df: Frame with an ``utterance`` column.
        threshold: Cosine similarity at or above which a later row is treated
            as a duplicate of an earlier kept row. Defaults to 0.8.

    Returns:
        A new frame holding the kept rows in their original order, with a
        fresh ``0..n-1`` index.
    """
    if len(df) == 0:
        # Nothing to encode or compare.
        return df.reset_index(drop=True)

    texts = df['utterance'].tolist()
    # Calculate embeddings for each utterance
    embeddings = model.encode(texts, convert_to_tensor=True)
    # Calculate the cosine similarity matrix; convert once to nested Python
    # floats so the pairwise loop does not index the tensor element by element.
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings).tolist()

    kept = []        # row positions to keep, in original order
    dropped = set()  # row positions found to duplicate an earlier kept row
    for i, row_scores in enumerate(cosine_scores):
        if i in dropped:
            continue
        kept.append(i)
        for j in range(i + 1, len(row_scores)):
            if row_scores[j] >= threshold:
                print(f"Similarity between '{texts[i]}' and '{texts[j]}' is {row_scores[j]:.2f}")
                dropped.add(j)

    # Filter the dataframe to keep only the selected sentences
    return df.iloc[kept].reset_index(drop=True)
def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` utterances most similar to ``user_text``.

    Args:
        model: Model whose ``encode`` method embeds the texts.
        user_text: The query text to compare against every utterance.
        df: Frame that must provide ``utterance``, ``intent`` and ``combined``
            columns (all three are selected in the result).
        top_n: Maximum number of rows to return. Defaults to 5.

    Returns:
        A new frame with columns ``utterance``, ``intent``, ``combined`` and
        ``similarity``, ordered from most to least similar. ``similarity`` is
        the cosine similarity between ``user_text`` and the row's utterance.
    """
    if len(df) == 0:
        # No utterances to rank; fall through with an empty selection.
        top_idx, top_scores = [], []
    else:
        # Get embeddings for user input
        user_embedding = model.encode(user_text, convert_to_tensor=True)
        # Get embeddings for all utterances
        embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)
        # Calculate cosine similarity between user input and all utterances
        cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]
        # Get top_n most similar utterances
        top_matches = cosine_scores.argsort(descending=True)[:top_n]
        # Plain Python lists: pandas positional indexing should not depend on
        # which device the tensor lives on.
        top_idx = top_matches.tolist()
        top_scores = cosine_scores[top_matches].tolist()

    # Copy so that adding the score column never touches the caller's frame.
    result = df.iloc[top_idx].copy()
    result['similarity'] = top_scores
    return result[['utterance', 'intent', 'combined', 'similarity']]
# Absolute path of the CSV that is read into `utterances` below.
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'

# Read the CSV into a dataframe.
utterances = pd.read_csv(file_path)

# Sentence-embedding model used for all similarity computations.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# The near-duplicate filtering step is currently disabled:
# filtered_utterances = filter_similar_sentences(model, utterances)
# filtered_utterances.head()

# Sample user inputs.
examples = [
    'vreau detalii despre ultima factura',
    'cat pot sa platesc',
    'informatii factura',
    'vreau informatii despre costuri',
    'as dori sa aflu ultima factura',
]
def filter_similar_sentences(
    model: SentenceTransformer,
    df: pd.DataFrame,
    threshold: float = 0.8,
) -> pd.DataFrame:
    """Drop utterances that are near-duplicates of an earlier kept utterance.

    Rows are visited in order. A row is kept unless an earlier kept row has a
    cosine similarity to it of at least ``threshold``; each such pair is
    printed.

    Args:
        model: Model whose ``encode`` method embeds the utterances.
        df: Frame with an ``utterance`` column.
        threshold: Cosine similarity at or above which a later row is treated
            as a duplicate of an earlier kept row. Defaults to 0.8.

    Returns:
        A new frame holding the kept rows in their original order, with a
        fresh ``0..n-1`` index.
    """
    if len(df) == 0:
        # Nothing to encode or compare.
        return df.reset_index(drop=True)

    texts = df['utterance'].tolist()
    embeddings = model.encode(texts, convert_to_tensor=True)
    # Convert once to nested Python floats so the pairwise loop does not
    # index the tensor element by element.
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings).tolist()

    kept = []        # row positions to keep, in original order
    dropped = set()  # row positions found to duplicate an earlier kept row
    for i, row_scores in enumerate(cosine_scores):
        if i in dropped:
            continue
        kept.append(i)
        for j in range(i + 1, len(row_scores)):
            if row_scores[j] >= threshold:
                print(f"Similarity between '{texts[i]}' and '{texts[j]}' is {row_scores[j]:.2f}")
                dropped.add(j)

    return df.iloc[kept].reset_index(drop=True)
def get_similar_sentences(model: SentenceTransformer, user_text: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` utterances most similar to ``user_text``.

    Args:
        model: Model whose ``encode`` method embeds the texts.
        user_text: The query text to compare against every utterance.
        df: Frame that must provide ``utterance``, ``intent`` and ``combined``
            columns (all three are selected in the result).
        top_n: Maximum number of rows to return. Defaults to 5.

    Returns:
        A new frame with columns ``utterance``, ``intent``, ``combined`` and
        ``similarity``, ordered from most to least similar. ``similarity`` is
        the cosine similarity between ``user_text`` and the row's utterance.
    """
    if len(df) == 0:
        # No utterances to rank; fall through with an empty selection.
        top_idx, top_scores = [], []
    else:
        user_embedding = model.encode(user_text, convert_to_tensor=True)
        embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(user_embedding, embeddings)[0]
        top_matches = cosine_scores.argsort(descending=True)[:top_n]
        # Plain Python lists: pandas positional indexing should not depend on
        # which device the tensor lives on.
        top_idx = top_matches.tolist()
        top_scores = cosine_scores[top_matches].tolist()

    # Copy so that adding the score column never touches the caller's frame.
    result = df.iloc[top_idx].copy()
    result['similarity'] = top_scores
    return result[['utterance', 'intent', 'combined', 'similarity']]
# Absolute path of the CSV that is read into `utterances` below.
file_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\InvoiceDetailsExplanation.csv'
utterances = pd.read_csv(file_path)

# Sentence-embedding model used for all similarity computations.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Sample user inputs to look up.
examples = ['vreau detalii despre ultima factura','cat pot sa platesc','informatii factura','vreau informatii despre costuri','as dori sa aflu ultima factura']

for example in examples:
    print(f"Input: {example}")
    # Compute the matches for this input before printing them; without this
    # assignment `similar_sentences` is undefined and the loop raises NameError.
    similar_sentences = get_similar_sentences(model, example, utterances)
    print(similar_sentences)
    print("\n")