|
import os |
|
import streamlit as st |
|
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
|
from llama_index.llms.gemini import Gemini |
|
from llama_index.core.extractors import TitleExtractor |
|
from llama_index.core.node_parser import SentenceWindowNodeParser |
|
from llama_index.core.ingestion import IngestionPipeline |
|
from llama_index.core.query_engine import RetrieverQueryEngine |
|
from llama_index.core.retrievers import AutoMergingRetriever |
|
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever |
|
from llama_index.vector_stores.chroma import ChromaVectorStore |
|
|
|
import chromadb |
|
|
|
from dotenv import load_dotenv |
|
|
|
# Pull environment variables from a local .env file (expects GOOGLE_API_KEY).
load_dotenv()

# Global LlamaIndex defaults: Gemini as the LLM with near-deterministic output.
# KeyError here is intentional — fail fast if the API key is not configured.
_gemini_api_key = os.environ["GOOGLE_API_KEY"]
Settings.llm = Gemini(api_key=_gemini_api_key, temperature=0.1)
Settings.chunk_size = 1024
|
|
|
|
|
|
|
@st.cache_resource
def load_data_and_create_index():
    """Load the PDF documents from ./data and build the vector index.

    Cached by Streamlit (``st.cache_resource``) so ingestion runs only once
    per server process, not on every rerun of the script.

    Returns:
        VectorStoreIndex backed by an in-memory Chroma collection.
    """
    documents = SimpleDirectoryReader("./data").load_data()

    # Split each document into per-sentence nodes; a 3-sentence window of
    # surrounding context is stored in each node's metadata under "window".
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    # Metadata enrichment: extract a title per group of up to 5 nodes.
    extractors = [TitleExtractor(nodes=5)]

    # NOTE: the original code also read
    # `node_parser.get_leaf_nodes_and_parent_nodes` — that attribute does not
    # exist on SentenceWindowNodeParser (it raised AttributeError) and the
    # result was never used, so the line was removed.
    pipeline = IngestionPipeline(
        transformations=[node_parser, *extractors]
    )
    nodes = pipeline.run(documents=documents)

    # In-memory Chroma client: the collection is lost when the process exits.
    db = chromadb.Client()
    chroma_collection = db.get_or_create_collection("legal_docs")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    # BUG FIX: the original called VectorStoreIndex.from_documents(nodes,
    # vector_store=...), but `from_documents` expects Document objects and a
    # `storage_context` kwarg — the Chroma store was never wired in. Build the
    # index directly from the already-parsed nodes with an explicit
    # StorageContext so embeddings actually land in the Chroma collection.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex(nodes, storage_context=storage_context)

    # NOTE(review): perform_query wraps retrieval in AutoMergingRetriever,
    # which expects a node hierarchy (HierarchicalNodeParser); sentence-window
    # nodes have no parents to merge into — confirm this pairing is intended.
    return index
|
|
|
|
|
def perform_query(query_str, index):
    """Run *query_str* against *index* and return the engine's response."""
    # Dense retrieval over the vector index: fetch the 8 nearest nodes.
    vector_retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=8,
    )
    # Auto-merging wrapper: when enough sibling leaves are retrieved, they are
    # replaced by their parent node from the index's storage context.
    merging_retriever = AutoMergingRetriever(vector_retriever, index.storage_context)

    # Assemble a query engine around the merged retriever and answer the query.
    engine = RetrieverQueryEngine.from_args(retriever=merging_retriever)
    return engine.query(query_str)
|
|
|
|
|
# --- Streamlit UI ---
st.title("Agent de Questions-Réponses Juridiques (Gemini + LlamaIndex)")

# Build (or fetch the cached) index before rendering the input widgets.
index = load_data_and_create_index()

query_str = st.text_input("Posez votre question juridique ici :")

if st.button("Poser la question"):
    # Guard clause: reject an empty question up front.
    if not query_str:
        st.error("Veuillez saisir une question.")
    else:
        with st.spinner("Recherche en cours..."):
            response = perform_query(query_str, index)
            st.success("Réponse :")
            st.write(response)