Phi3-Mini-ONXX / pre_processing.py
SwastikM's picture
Upload 16 files
83686b4 verified
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
# Hugging Face model id of the sentence-embedding model used by this module.
check_point = 'nomic-ai/nomic-embed-text-v1'
# Instantiated once at import time; get_embeddings and create_embedding use
# this shared instance.
# NOTE(review): trust_remote_code=True presumably allows custom code shipped
# with the model repository to run locally -- confirm the source is trusted
# (and consider pinning a revision).
embedding_model = SentenceTransformer(check_point,trust_remote_code=True)
def parese_doc(doc, first_section, ignore_after, *, group_size=20, overlap=5):
    """Extract a document's text and split it into overlapping sentence groups.

    The text of every page is concatenated, newlines are replaced by spaces
    and the result is lower-cased. If both markers are found, in order, the
    text is trimmed to the span that starts at the first occurrence of
    ``first_section`` and ends just before the last occurrence of
    ``ignore_after``; otherwise the whole text is kept. The text is then split
    on ``'. '`` and re-joined into windows of ``group_size`` sentences, where
    consecutive windows share ``overlap`` sentences.

    Args:
        doc: Object exposing ``pages``, an iterable of objects that each
            provide ``extract_text()``.
        first_section: Marker of the first section to keep. Matched
            case-insensitively because the text is lower-cased first.
        ignore_after: Marker after which everything is dropped. Matched
            case-insensitively; the last occurrence is used.
        group_size: Number of sentences per context window.
        overlap: Number of sentences shared by consecutive windows.

    Returns:
        A list of context strings. It always holds at least one element
        (an empty string when the document has no text).

    Raises:
        ValueError: If ``group_size`` is not positive or ``overlap`` is not
            in ``[0, group_size)``; such values would stop the window from
            advancing.
    """
    if group_size <= 0:
        raise ValueError('group_size must be a positive integer')
    if not 0 <= overlap < group_size:
        raise ValueError('overlap must satisfy 0 <= overlap < group_size')

    # Join once instead of repeated ``+=``; ``or ''`` guards against
    # extractors that return None for a page without text.
    raw_text = ''.join(page.extract_text() or '' for page in doc.pages)
    cleaned_string = raw_text.replace('\n', ' ').lower()

    # The text is lower-cased, so the markers must be too or they can never
    # match when the caller passes them with capital letters.
    start_index = cleaned_string.find(first_section.lower())
    end_index = cleaned_string.rfind(ignore_after.lower())
    # Trim only when both markers exist and appear in order; slicing with
    # out-of-order markers would silently discard the whole document.
    if start_index != -1 and end_index != -1 and start_index < end_index:
        cleaned_string = cleaned_string[start_index:end_index]

    sentence_list = cleaned_string.split('. ')
    step = group_size - overlap
    # ``str.split`` never returns an empty list, so at least one window is
    # always produced.
    return [
        '. '.join(sentence_list[start:start + group_size])
        for start in range(0, len(sentence_list), step)
    ]
def get_embeddings(doc):
    """Encode ``doc`` with the module-level embedding model.

    ``doc`` is forwarded unchanged to ``embedding_model.encode`` and the
    value returned by that call is handed back to the caller.
    """
    return embedding_model.encode(doc)
def create_embedding(context_list):
    """Build a FAISS flat L2 index over the embeddings of ``context_list``.

    Args:
        context_list: Iterable of context strings to embed. The position of
            each string becomes its row id in the returned index.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimension matches the embedding model
        and which holds one vector per context. An empty ``context_list``
        yields an empty index instead of failing on a malformed array.
    """
    embedding_dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(embedding_dimension)

    # Materialise once so generators are supported and emptiness can be
    # checked before any model work is done.
    contexts = list(context_list)
    if not contexts:
        return index

    # Encode all contexts in a single call so the model can batch them,
    # rather than invoking it once per context.
    # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
    embeddings_array = np.ascontiguousarray(
        get_embeddings(contexts), dtype=np.float32
    )
    index.add(embeddings_array)
    return index