NLP / utils /vector_base.py
maximka608
test
b93b2dc
raw
history blame contribute delete
878 Bytes
import faiss, json
from script.preprocessing_text import Preprocessor
from rank_bm25 import BM25Okapi
import numpy as np
class KnowledgeBase:
def __init__(self, faiss_path, preprocessing_path) -> None:
self.BM25_model = BM25Okapi(self._load(preprocessing_path))
self.vector_base = faiss.read_index(faiss_path)
def _load(self, path):
with open(path, 'rb') as file:
data = json.load(file)
return data
def search_by_BM25(self, query, k=5):
preprocessor = Preprocessor()
prep_query = preprocessor.preprocessing_text(query)
doc_scores = self.BM25_model.get_scores(prep_query)
sorted_docs = np.argsort(-doc_scores)
return sorted_docs[:k].tolist()
def search_by_embedding(self, embedding, k):
_, indexes = self.vector_base.search(embedding, k)
return indexes