Spaces:
Running
Running
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
# Embedding backbone: Nomic's text embedder from the Hugging Face Hub.
# trust_remote_code=True is required because this checkpoint ships custom
# modeling code alongside the weights.
check_point = 'nomic-ai/nomic-embed-text-v1'
embedding_model = SentenceTransformer(check_point, trust_remote_code=True)
def parese_doc(doc, first_section, ignore_after):
    """Extract a PDF's text, trim it between two markers, and chunk it
    into overlapping sentence groups.

    Parameters
    ----------
    doc : object
        A PDF reader exposing ``pages`` whose items implement
        ``extract_text()`` (e.g. a ``pypdf.PdfReader``) — assumed, not
        enforced; TODO confirm against callers.
    first_section : str
        Marker (matched against the lower-cased text); chunking starts at
        its first occurrence.
    ignore_after : str
        Marker; text from its last occurrence onward is dropped.

    Returns
    -------
    list[str]
        Chunks of up to 20 sentences each, consecutive chunks sharing a
        5-sentence overlap. Always contains at least one element.
    """
    # extract_text() may return None for pages with no extractable text
    # (pypdf behavior) — guard with `or ''`. str.join avoids quadratic
    # string concatenation across many pages.
    full_text = ''.join((page.extract_text() or '') for page in doc.pages)
    cleaned_string = full_text.replace('\n', ' ').lower()

    # Trim to the region of interest only when BOTH markers are present;
    # otherwise chunk the whole document unchanged.
    start_index = cleaned_string.find(first_section)
    end_index = cleaned_string.rfind(ignore_after)
    if start_index != -1 and end_index != -1:
        cleaned_string = cleaned_string[start_index:end_index]

    sentence_list = cleaned_string.split('. ')

    # Sliding window over sentences: 20 per chunk, stepping by 15 so each
    # chunk overlaps the previous one by 5 sentences (preserves context
    # across chunk boundaries for retrieval).
    group_size = 20
    overlap = 5
    step = group_size - overlap
    context_list = []
    i = 0
    # Do-while shape: always emit at least one chunk, even for empty text.
    while True:
        context_list.append('. '.join(sentence_list[i:i + group_size]))
        i += step
        if i >= len(sentence_list):
            break
    return context_list
def get_embeddings(doc):
    """Encode *doc* with the module-level SentenceTransformer.

    Parameters
    ----------
    doc : str | list[str]
        Passed straight through to ``embedding_model.encode``.

    Returns
    -------
    The embedding vector(s) produced by ``encode``.
    """
    return embedding_model.encode(doc)
def create_embedding(context_list):
    """Build a flat L2 FAISS index over the embeddings of *context_list*.

    Parameters
    ----------
    context_list : list[str]
        Text chunks to embed (via :func:`get_embeddings`) and index.

    Returns
    -------
    faiss.IndexFlatL2
        An index holding one vector per chunk, in input order.
    """
    dim = embedding_model.get_sentence_embedding_dimension()
    # Embed each chunk individually, then stack into one (n, dim) array
    # for a single bulk add to the index.
    vectors = np.array([get_embeddings(chunk) for chunk in context_list])
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index