from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from sentence_transformers import SentenceTransformer, util
from langchain.docstore.document import Document
import numpy as np
from config import *
import os
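# Pointing CURL_CA_BUNDLE at an empty path disables SSL certificate verification
# for HTTP downloads (e.g. model files fetched behind a proxy with self-signed
# certificates). This explanatory comment is an assumption about the intent.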
os.environ['CURL_CA_BUNDLE'] = ""
# BGE embedding with the retrieval instruction from config; used to query the
# persisted Chroma index below.
embedding_int = HuggingFaceBgeEmbeddings(
    model_name=MODEL_NAME,
    encode_kwargs=ENCODE_KWARGS,
    query_instruction=QUERY_INSTRUCTION
)

# Same model with a generic similarity instruction; used for the temporary
# occupation index built in find_similar_occupation().
embedding_sim = HuggingFaceBgeEmbeddings(
    model_name=MODEL_NAME,
    encode_kwargs=ENCODE_KWARGS,
    query_instruction='Retrieve semantically similar text.'
)
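# Persisted Chroma collection and a retriever that returns the TOP_K most
# similar documents for a query.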
db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_int)
retriever = db.as_retriever(search_kwargs={"k": TOP_K})
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
    # A Document is created for each occupation (row in `berufe`). It holds
    # metadata as well as a page_content; the page_content is embedded and
    # used for the search.
    docs = []
    for index, beruf in berufe.iterrows():
        # Create document.
        doc = Document(
            page_content=beruf['short name'] + ' ' + beruf['full name'] + ' ' + beruf['description'],
            metadata={
                "id": beruf["id"],
                "name": beruf['short name'],
                "description": beruf["description"],
                "entry_requirements": beruf["entry requirements"]
            },
        )
        docs.append(doc)

    # Build a temporary Chroma index using the requested similarity function
    # ("cosine", "l2" or "ip").
    db_temp = Chroma.from_documents(documents=docs, embedding=embedding_sim, collection_metadata={"hnsw:space": similarity_func})
    # The retriever returns the top_k documents most similar to the query.
    retriever_temp = db_temp.as_retriever(search_kwargs={"k": top_k})
    top_similar_occupations = retriever_temp.get_relevant_documents(target_occupation_query)
    return top_similar_occupations
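
# --- Usage sketch (illustrative only, not part of the original module) ---
# `berufe` is assumed to be a pandas DataFrame with the columns accessed above
# ('id', 'short name', 'full name', 'description', 'entry requirements');
# the sample row and query below are made-up placeholder values.
if __name__ == "__main__":
    import pandas as pd

    berufe = pd.DataFrame([{
        "id": 1,
        "short name": "Informatiker/in EFZ",
        "full name": "Informatiker/in EFZ Applikationsentwicklung",
        "description": "Develops, tests and maintains software applications.",
        "entry requirements": "Completed compulsory schooling",
    }])

    matches = find_similar_occupation(
        target_occupation_query="software development apprenticeship",
        berufe=berufe,
        top_k=1,
        similarity_func="cosine",  # Chroma hnsw:space: "cosine", "l2" or "ip"
    )
    for doc in matches:
        print(doc.metadata["name"], "->", doc.page_content[:80])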