In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

#ignore warinings   
import warnings
warnings.filterwarnings("ignore")

def filter_similar_sentences(model: SentenceTransformer, df: pd.DataFrame) -> pd.DataFrame:
    # Calculate embeddings for each utterance
    embeddings = model.encode(df['utterance'].tolist(), convert_to_tensor=True)
    
    # Calculate cosine similarity matrix
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
    
    # Keep track of sentences to keep
    to_keep = set()
    
    for i in range(len(df)):
        if i not in to_keep:
            to_keep.add(i)
            for j in range(i + 1, len(df)):
                if cosine_scores[i][j] < 0.5:
                    print(f"Similarity between '{df.iloc[i]['utterance']}' and '{df.iloc[j]['utterance']}' is {cosine_scores[i][j]:.2f}")
                    to_keep.add(j)
    print()
    print(f"Kept {len(to_keep)} out of {len(df)} sentences")
    print(to_keep)


    # Filter the dataframe to keep only the selected sentences
    filtered_df = df.iloc[list(to_keep)].reset_index(drop=True)
    
    return filtered_df

    



  from tqdm.autonotebook import tqdm, trange


In [6]:

def filter_similar_sentences_transformers(model: SentenceTransformer, df: pd.DataFrame, intent_values: list , threshold: float = 0.5) -> pd.DataFrame:
        # Calculate embeddings for each utterance

        intents_df = df.copy()[df['intent'].isin(intent_values)].reset_index(drop=True)
        utterances_list = intents_df['utterance'].tolist()
        embeddings = model.encode(utterances_list, convert_to_tensor=True)
        
        # Calculate cosine similarity matrix
        similarities = model.similarity(embeddings, embeddings)
        too_similar_df = pd.DataFrame(columns=['utterance', 'intent'])
        too_similar_rows = []
        
        # Keep track of sentences to keep
        indexes_to_keep = intents_df.copy().index.values.tolist()
        print(f'Indexes: {indexes_to_keep}')
        for i in range(len(indexes_to_keep)):
            for j in range(i + 1, len(indexes_to_keep)):
                if similarities[i][j] >= threshold:
                    print(f"Similarity between '{intents_df.iloc[i]['utterance']}' and '{intents_df.iloc[j]['utterance']: <50}' is {similarities[i][j]:.2f}'")
                    if j in indexes_to_keep:
                        # append the intents_df.iloc[j] row to too_similar_df df
                        too_similar_rows.append(intents_df.iloc[j])


                        print(f"Removing '{intents_df.iloc[j]['utterance']}'")
                        #print('J', j)   
                        #print(f'Indexes: {indexes_to_keep}')
                        indexes_to_keep.remove(j)
        
        # Filter the dataframe to keep only the selected sentences
        filtered_df = intents_df.iloc[indexes_to_keep].reset_index(drop=True)

        # Create a new dataframe with the removed sentences
        too_similar_df = pd.DataFrame(too_similar_rows).reset_index(drop=True)
        
        return filtered_df, too_similar_df


### Filter Intent Real dataframe !!!

In [7]:
# 
directory_path = r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data'
file_name = 'InvoiceDetailsExplanation.csv'
file_path = directory_path + '\\' + file_name

# Load the data
utterances_df = pd.read_csv(file_path)
print(f'Length of the dataframe: {len(utterances_df)}')
print()
# Load the model multilingual-e5-small from sentence-transformers
# 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer('BlackKakapo/stsb-xlm-r-multilingual-ro')

#
columns = ['invoice.Explanation']
# Filter similar sentences
filtered_df, removed_df = filter_similar_sentences_transformers(model, utterances_df, columns, 0.7)

print(f'Length of the filtered dataframe: {len(filtered_df)}')
# Display the filtered dataframe
filtered_df.head()

#column name = columns.join('_')
# Save the filtered dataframe
#filtered_file_name = f'{columns}'InvoiceDetailsExplanation12.csv'
filtered_file_name = 'GoldenIntent'

column_name = '_'.join(columns).replace('.','')
save_path = directory_path + '\\' + filtered_file_name + '_' + column_name + '.csv'
filtered_df.to_csv(save_path, index=False)

# Save the removed dataframe
removed_file_name = 'RemovedIntents'
removed_path = directory_path + '\\' + removed_file_name + '_' + column_name + '.csv'
removed_df.to_csv(removed_path, index=False)



Length of the dataframe: 752

Indexes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 

In [316]:
utterances_df['intent'].value_counts()

intent
tshooting                    1024
reziliere                     809
offer.Request                 489
switch.To.Agent               399
invoice.Explanation           220
                             ... 
info.faq.Nr.WhatsApp            2
info.faq.VTV.login              2
info.faq.Cable.Connection       1
info.faq.Port.Usage             1
info.faq.Netflix                1
Name: count, Length: 104, dtype: int64

In [317]:
filtered_utterances['intent'].value_counts()

intent
invoice.PC         107
invoice.Details     43
Name: count, dtype: int64

### Other models

#### Alibaba


#### e5

In [None]:
intfloat/multilingual-e5-large

### Run the entire dataset

### Examples