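# Build a FAISS L2 index over sentence embeddings of the intent utterances
# and run a sample nearest-neighbour query against it.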
from sentence_transformers import SentenceTransformer
from A_Preprocess import load_pdf_data
from E_Model_utils import get_embeddings
import numpy as np
import faiss

# Script outline:
#   1. Load the utterances from the cleaned intents file.
#   2. Embed them with a SentenceTransformer model.
#   3. Store the vectors in a flat (exact) L2 FAISS index and persist it.
#   4. Reload the index and run one sample nearest-neighbour query.

# Load and preprocess data
# NOTE(review): load_pdf_data is handed a .csv path here -- confirm the helper
# really handles CSV input. The path is machine-specific (absolute Windows path).
data_file_path = r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents_Cleaned.csv'
data = load_pdf_data(data_file_path)
sentences = data['utterance'].tolist()

# Single source of truth for the model name: it selects the hub checkpoint
# and also prefixes the index file name, so the two can never drift apart.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(f'sentence-transformers/{model_name}')

embeddings = get_embeddings(model, sentences)
print(f'Embeddings shape: {embeddings.shape}.')

# FAISS consumes C-contiguous float32 matrices of shape (n_vectors, dim).
embeddings = np.ascontiguousarray(embeddings, dtype='float32')

# Create a FAISS index (exact search, L2 distance) and add all vectors.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save the FAISS index, then load it back (as a later consumer would).
index_path = f"{model_name}_faiss.index"
faiss.write_index(index, index_path)
index = faiss.read_index(index_path)

# Example: find the 5 nearest neighbours of a free-text query.
# The query must be embedded with the same model as the corpus; previously the
# text was assigned and then immediately overwritten by embeddings[0], so the
# search never used it.
query_text = 'cat am de platit la factura'  # Romanian: "how much do I have to pay on the bill"
# Assumes get_embeddings accepts a one-element list just like the corpus call
# above -- TODO confirm against E_Model_utils.
query_embedding = np.ascontiguousarray(
    get_embeddings(model, [query_text]), dtype='float32'
).reshape(1, -1)  # index.search expects a 2-D (n_queries, dim) array

D, I = index.search(query_embedding, 5)  # D: distances, I: indices
print("Indices of nearest neighbors:", I)
print("Distances of nearest neighbors:", D)