Spaces:
Sleeping
Sleeping
File size: 2,184 Bytes
4db208a 8514dc9 4db208a 8514dc9 4db208a 8514dc9 4db208a 8514dc9 4db208a 8514dc9 4db208a 8514dc9 4db208a 8514dc9 4db208a 8514dc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_text_splitters import CharacterTextSplitter
from llm.gemini import Gemini
from utils.questions_parser import parse_question
class Retriever:
    """Build Chroma-backed retrievers over a plain-text question file.

    On construction the text file is loaded, split into chunks and embedded
    with the embeddings exposed by the shared ``Gemini`` model.

    Attributes set by ``__init__``:
        vectorstore: Chroma store created with a ``persist_directory``.
        metadata_field_info: ``AttributeInfo`` entries describing the
            metadata fields offered to the self-query retriever.
        retriever: ``SelfQueryRetriever`` built from the model's LLM and
            ``vectorstore``.
        docs_retriever: retriever obtained via ``as_retriever()`` from a
            second Chroma store created without a persist directory.
    """

    # Created once, when the class body is executed, and shared by every
    # instance.
    _model = Gemini()

    def __init__(
        self,
        data_path: "str | None" = None,
        *,
        persist_directory: str = "./chroma_db",
        chunk_size: int = 1024,
        chunk_overlap: int = 0,
    ) -> None:
        """Load, split and index the documents, then build both retrievers.

        Args:
            data_path: Path of the text file to index. When ``None`` the
                ``DATA_PATH`` environment variable is used instead.
            persist_directory: Directory handed to Chroma for
                ``self.vectorstore``.
            chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
            chunk_overlap: ``chunk_overlap`` passed to
                ``CharacterTextSplitter``.

        Raises:
            ValueError: If ``data_path`` is not given and ``DATA_PATH`` is
                missing or empty.
        """
        if data_path is None:
            data_path = os.environ.get("DATA_PATH")
            # An empty value is treated like a missing one so the failure is
            # reported here rather than from inside the loader.
            if not data_path:
                raise ValueError("DATA_PATH environment variable is not set")

        raw_documents = TextLoader(data_path, encoding="UTF-8").load()
        splitter = CharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs = splitter.split_documents(raw_documents)

        embeddings = self._model.embeddings

        self.vectorstore = Chroma.from_documents(
            docs, embeddings, persist_directory=persist_directory
        )

        # Descriptions are runtime strings passed to the self-query
        # retriever; they are intentionally left in Portuguese.
        self.metadata_field_info = [
            AttributeInfo(
                name="topico",
                description="A materia escolar da qual a questão pertence.",
                type="string",
            ),
            AttributeInfo(
                name="assunto",
                description="O assunto da materia fornecida anteriormente.",
                type="string",
            ),
            AttributeInfo(
                name="dificuldade",
                description="O nivel de dificuldade para resolver a questao.",
                type="string",
            ),
            AttributeInfo(
                name="tipo",
                description="O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
                type="string",
            ),
        ]
        document_content_description = "Questões de matérias do ensino médio."

        self.retriever = SelfQueryRetriever.from_llm(
            self._model.llm,
            self.vectorstore,
            document_content_description,
            self.metadata_field_info,
            verbose=True,
        )

        # NOTE(review): the same chunks are embedded a second time into a
        # separate store created without a persist directory. Reusing
        # self.vectorstore would avoid the duplicate embedding work, but the
        # two stores are kept apart here to preserve existing behaviour --
        # confirm whether the separation is intentional.
        plain_store = Chroma.from_documents(docs, embeddings)
        self.docs_retriever = plain_store.as_retriever()
|