import os

from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

from llm.gemini import Gemini
from utils.questions_parser import parse_question


class Retriever:
    """Build a self-querying retriever over the question bank at ``DATA_PATH``.

    On construction the text file named by the ``DATA_PATH`` environment
    variable is loaded, split on the ``##Questão`` marker, each piece is
    passed through ``parse_question``, and the results are indexed in a
    Chroma vector store that backs a ``SelfQueryRetriever``.

    Attributes set by ``__init__``:
        data_loader: the list returned by ``TextLoader(...).load()`` (the
            loaded documents, despite the name).
        questions: list of raw question strings, each starting with the
            ``##Questão`` marker.
        docs: the values returned by ``parse_question`` for every question
            that parsed without raising.
        vectorstore: the Chroma store built from ``docs``.
        metadata_field_info: ``AttributeInfo`` descriptions of the metadata
            fields exposed to the self-query LLM.
        retriever: the resulting ``SelfQueryRetriever``.
    """

    # Shared by all instances. NOTE: this is instantiated when the class body
    # runs, i.e. at module import time.
    _model = Gemini()

    # Delimiter that introduces each question in the data file.
    _QUESTION_MARKER = "##Questão"

    def __init__(self):
        """Load, parse and index the questions, then build the retriever.

        Raises:
            ValueError: if the ``DATA_PATH`` environment variable is not set.
        """
        data_path = os.environ.get("DATA_PATH")
        if data_path is None:
            raise ValueError("DATA_PATH environment variable is not set")

        self.data_loader = TextLoader(data_path, encoding="UTF-8").load()
        self.questions = self._split_questions(self.data_loader[0].page_content)

        self.docs = []
        for question in self.questions:
            # Best effort: a question that fails to parse is reported on
            # stdout and skipped so that the remaining ones still get indexed.
            try:
                self.docs.append(parse_question(question))
            except Exception as e:
                print(e, question)

        # NOTE(review): the store is persisted to ./chroma_db and rebuilt from
        # all documents on every instantiation - presumably this can add
        # duplicate entries across runs; confirm against the Chroma docs.
        self.vectorstore = Chroma.from_documents(
            self.docs,
            self._model.embeddings,
            persist_directory="./chroma_db",
        )

        self.metadata_field_info = [
            AttributeInfo(
                name="topico",
                description="A materia escolar da qual a questão pertence.",
                type="string",
            ),
            AttributeInfo(
                name="assunto",
                description="O assunto da materia fornecida anteriormente.",
                type="string",
            ),
            AttributeInfo(
                name="dificuldade",
                description="O nivel de dificuldade para resolver a questao.",
                type="string",
            ),
            AttributeInfo(
                name="tipo",
                description="O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
                type="string",
            ),
        ]

        document_content_description = "Questões de matérias do ensino médio."
        self.retriever = SelfQueryRetriever.from_llm(
            self._model.llm,
            self.vectorstore,
            document_content_description,
            self.metadata_field_info,
            verbose=True,
        )

    @classmethod
    def _split_questions(cls, text):
        """Split *text* on the question marker and re-attach it to each piece.

        ``str.split`` always yields the text that precedes the first marker
        as its first element; when the file begins with the marker that
        element is an empty string, which would otherwise become a bogus
        question consisting of the marker alone. Blank pieces are therefore
        skipped.
        """
        marker = cls._QUESTION_MARKER
        return [marker + chunk for chunk in text.split(marker) if chunk.strip()]