Spaces:
Sleeping
Sleeping
import os | |
from langchain_community.document_loaders import TextLoader | |
from langchain.vectorstores import Chroma | |
from langchain.chains.query_constructor.base import AttributeInfo | |
from langchain.retrievers.self_query.base import SelfQueryRetriever | |
from langchain_text_splitters import CharacterTextSplitter | |
from llm.gemini import Gemini | |
from utils.questions_parser import parse_question | |
class Retriever:
    """Index the question file in Chroma and expose two retrievers over it.

    Attributes set by ``__init__``:
        vectorstore: Chroma store built from the chunked data file and
            persisted under ``./chroma_db``.
        metadata_field_info: ``AttributeInfo`` descriptions of the metadata
            fields handed to the self-query retriever.
        retriever: ``SelfQueryRetriever`` built from the model's LLM and
            ``vectorstore``.
        docs_retriever: Plain retriever obtained from ``vectorstore``.
    """

    # Created once, when the class body runs, and shared by every instance.
    _model = Gemini()

    def __init__(self) -> None:
        """Load the data file, index it, and build both retrievers.

        Raises:
            ValueError: If the ``DATA_PATH`` environment variable is unset
                or empty.
        """
        # An empty value is rejected as well: it cannot name a file, and
        # failing here is clearer than failing inside the loader.
        data_path = os.environ.get("DATA_PATH")
        if not data_path:
            raise ValueError("DATA_PATH environment variable is not set")

        raw_documents = TextLoader(data_path, encoding="UTF-8").load()
        text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
        docs = text_splitter.split_documents(raw_documents)

        # NOTE(review): this runs on every construction against a fixed
        # persist_directory; presumably chunks are added again to whatever
        # is already stored there -- confirm and de-duplicate if so.
        self.vectorstore = Chroma.from_documents(
            docs, self._model.embeddings, persist_directory="./chroma_db"
        )

        # Descriptions are runtime text passed to the query constructor;
        # they are intentionally kept in Portuguese.
        self.metadata_field_info = [
            AttributeInfo(
                name="topico",
                description="A materia escolar da qual a questão pertence.",
                type="string",
            ),
            AttributeInfo(
                name="assunto",
                description="O assunto da materia fornecida anteriormente.",
                type="string",
            ),
            AttributeInfo(
                name="dificuldade",
                description="O nivel de dificuldade para resolver a questao.",
                type="string",
            ),
            AttributeInfo(
                name="tipo",
                description="O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
                type="string",
            ),
        ]
        document_content_description = "Questões de matérias do ensino médio."

        self.retriever = SelfQueryRetriever.from_llm(
            self._model.llm,
            self.vectorstore,
            document_content_description,
            self.metadata_field_info,
            verbose=True,
        )

        # Reuse the store built above instead of calling
        # Chroma.from_documents a second time: the same chunks and the same
        # embedding function were being embedded twice only to obtain this
        # retriever.
        self.docs_retriever = self.vectorstore.as_retriever()