Melhorias no carregamento dos documentos (Improvements to document loading)
#4 by leandroaraujodev - opened

app.py CHANGED
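In summary, the new version of app.py groups the llama_index.core imports, drops the bm25s monkey patch and the commented-out Ollama/HuggingFace/NuExtract experiments, fills in concrete paths for the document, Chroma, and BM25 stores, and caches the docstore, vector store, index, BM25 retriever, fusion retriever, chat engine, and chat store in st.session_state so Streamlit reruns reuse them instead of rebuilding everything. In the diff below, […] marks removed lines whose text is not legible in this page rendering.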
@@ -1,86 +1,43 @@
 import logging
 import sys
 import os
+
 import re
 import base64
 import nest_asyncio
+nest_asyncio.apply()
 import pandas as pd
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from PIL import Image
 import streamlit as st
 import torch
-[…]
+
+# LlamaIndex imports
+from llama_index.core import (
+    Settings,
+    SimpleDirectoryReader,
+    StorageContext,
+    Document
+)
+
 from llama_index.core.storage.docstore import SimpleDocumentStore
-# from llama_index.llms.ollama import Ollama
-# from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.core.node_parser import LangchainNodeParser
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from llama_index.core.storage.chat_store import SimpleChatStore
 from llama_index.core.memory import ChatMemoryBuffer
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.chat_engine import CondensePlusContextChatEngine
-#from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
 from llama_index.vector_stores.chroma import ChromaVectorStore
 from llama_index.core import VectorStoreIndex
-
-# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
-# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
 import chromadb
 
 ###############################################################################
 # MONKEY PATCH FOR bm25s                                                      #
 ###############################################################################
 import bm25s
-
-# Keep a reference to the original function
-orig_find_newline_positions = bm25s.utils.corpus.find_newline_positions
-
-def patched_find_newline_positions(path, show_progress=True, leave_progress=True):
-    """
-    'Quick-hack' version of the original function that forces encoding='utf-8'
-    and ignores decoding errors, so we avoid UnicodeDecodeError even when the
-    file contains characters outside the UTF-8 range.
-
-    (Based on standard adjustments to how Python reads files.)
-    """
-    path = str(path)
-    indexes = []
-
-    with open(path, "r", encoding="utf-8", errors="ignore") as f:
-        indexes.append(f.tell())
-        file_size = os.path.getsize(path)
-
-        try:
-            from tqdm.auto import tqdm
-            pbar = tqdm(
-                total=file_size,
-                desc="Finding newlines for mmindex",
-                unit="B",
-                unit_scale=True,
-                leave=leave_progress,
-                disable=not show_progress,
-            )
-        except ImportError:
-            pbar = None
-
-        while True:
-            line = f.readline()
-            if not line:
-                break
-            t = f.tell()
-            indexes.append(t)
-            if pbar is not None:
-                pbar.update(t - indexes[-2])
-
-        if pbar is not None:
-            pbar.close()
-
-    return indexes[:-1]
-
-# Apply our patch
-bm25s.utils.corpus.find_newline_positions = patched_find_newline_positions
 ###############################################################################
 # BM25Retriever CLASS (ADJUSTED FOR ENCODING)                                 #
 ###############################################################################
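The removed monkey patch existed only to avoid UnicodeDecodeError while bm25s scans its corpus file. A minimal, self-contained sketch of the decoding behavior it relied on (not part of the PR):

raw = b"repres\xe1lia"  # 0xE1 is "á" in Latin-1, but not a valid UTF-8 sequence here
print(raw.decode("utf-8", errors="ignore"))  # -> "represlia"; the bad byte is dropped

The same errors="ignore" flag on open() is what the patched function used when reading the corpus.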
@@ -261,28 +218,21 @@ class BM25Retriever(BaseRetriever):
 
         return nodes
 
-# Tab icon configuration
-im = Image.open("pngegg.png")
-st.set_page_config(page_title = "Chatbot Carômetro", page_icon=im, layout = "wide")
+###############################################################################
+# STREAMLIT CONFIGURATION AND PIPELINE ADJUSTMENTS                            #
+###############################################################################
+# Avoid re-indexing or re-downloading data by keeping state in the session.
+im = Image.open("pngegg.png")
+st.set_page_config(page_title="Chatbot Carômetro", page_icon=im, layout="wide")
 
-
-
-
-
-
-#
+# Sidebar sections
 st.sidebar.title("Configuração de LLM")
 sidebar_option = st.sidebar.radio("Selecione o LLM", ["gpt-3.5-turbo"])
 
-# logo_url = 'app\logos\logo-sicoob.jpg'
-# st.sidebar.image(logo_url)
 import base64
-
-# Sidebar image configuration
 with open("sicoob-logo.png", "rb") as f:
     data = base64.b64encode(f.read()).decode("utf-8")
-
 st.sidebar.markdown(
     f"""
     <div style="display:table;margin-top:-80%;margin-left:0%;">
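For context, the markdown block above typically closes with a base64 data URI so the logo renders without a static file route; the exact <div> contents are cut off in this page, so the img markup below is an assumption:

import base64

with open("sicoob-logo.png", "rb") as f:
    data = base64.b64encode(f.read()).decode("utf-8")
logo_html = f'<img src="data:image/png;base64,{data}" width="150">'  # hypothetical markup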
@@ -292,67 +242,24 @@ with open("sicoob-logo.png", "rb") as f:
     unsafe_allow_html=True,
 )
 
-#if sidebar_option == "Ollama":
-#    Settings.llm = Ollama(model="llama3.2:latest", request_timeout=500.0, num_gpu=1)
-#    Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text:latest")
 if sidebar_option == "gpt-3.5-turbo":
     from llama_index.llms.openai import OpenAI
     from llama_index.embeddings.openai import OpenAIEmbedding
     Settings.llm = OpenAI(model="gpt-3.5-turbo")
     Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
-# elif sidebar_option == 'NuExtract-1.5':
-#     # HuggingFace embedding
-#     Settings.embed_model = HuggingFaceEmbedding(
-#         model_name="BAAI/bge-small-en-v1.5"
-#     )
-#     # Local model loading; uncomment the desired model
-
-#     llm = HuggingFaceLLM(
-#         context_window=2048,
-#         max_new_tokens=2048,
-#         generate_kwargs={"do_sample": False},
-#         #query_wrapper_prompt=query_wrapper_prompt,
-#         #model_name="Qwen/Qwen2.5-Coder-32B-Instruct",
-#         #model_name="Qwen/Qwen2.5-14B-Instruct",
-#         # model_name="meta-llama/Llama-3.2-3B",
-#         #model_name="HuggingFaceH4/zephyr-7b-beta",
-#         # model_name="meta-llama/Meta-Llama-3-8B",
-#         model_name="numind/NuExtract-1.5",
-#         #model_name="meta-llama/Llama-3.2-3B",
-#         tokenizer_name="numind/NuExtract-1.5",
-#         device_map="auto",
-#         tokenizer_kwargs={"max_length": 512},
-#         # uncomment this if using CUDA to reduce memory usage
-#         model_kwargs={"torch_dtype": torch.bfloat16},
-#     )
-#     chat = [
-#         {"role": "user", "content": "Hello, how are you?"},
-#         {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
-#         {"role": "user", "content": "I'd like to show off how chat templating works!"},
-#     ]
-
-#     from transformers import AutoTokenizer
-
-#     tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract-1.5")
-#     tokenizer.apply_chat_template(chat, tokenize=False)
-
-#     Settings.chunk_size = 512
-#     Settings.llm = llm
-
 else:
     raise Exception("Opção de LLM inválida!")
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 
-
-# Directories configured by the user
+# Main paths
 chat_store_path = os.path.join("chat_store", "chat_store.json")
-documents_path = […]
-chroma_storage_path = […]
-bm25_persist_path = […]
+documents_path = "documentos"
+chroma_storage_path = "chroma_db"
+bm25_persist_path = "bm25_retriever"
 
-# CSV class
+# Custom CSV reader class
 class CustomPandasCSVReader:
     """PandasCSVReader modified to include headers in the documents."""
     def __init__(
@@ -394,15 +301,20 @@ class CustomPandasCSVReader:
         for text in text_list
     ]
 
-def clean_documents(documents):
-    """Remove characters […]
-
+def clean_documents(documents: List[Document]) -> List[Document]:
+    """Remove unwanted characters directly from the texts."""
+    cleaned_docs = []
     for doc in documents:
         cleaned_text = re.sub(r"[^0-9A-Za-zÀ-ÿ ]", "", doc.get_content())
         doc.text = cleaned_text
-
-    return
+        cleaned_docs.append(doc)
+    return cleaned_docs
 
+def are_docs_downloaded(directory_path: str) -> bool:
+    """Check whether the directory contains any files."""
+    return os.path.isdir(directory_path) and any(os.scandir(directory_path))
+
+# Simulates reading the files from Google Drive
 from llama_index.readers.google import GoogleDriveReader
 import json
 
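A quick check of what clean_documents keeps: digits, ASCII letters, characters in the Latin-1 range À-ÿ, and spaces; everything else (punctuation, line breaks) is stripped:

import re

print(re.sub(r"[^0-9A-Za-zÀ-ÿ ]", "", "Olá, mundo! Carômetro #4"))
# -> "Olá mundo Carômetro 4"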
@@ -424,10 +336,12 @@ with open(token_path, 'w') as credentials_file:
 google_drive_reader = GoogleDriveReader(credentials_path=credentials_path)
 google_drive_reader._creds = google_drive_reader._get_credentials()
 
-def download_original_files_from_folder(greader: GoogleDriveReader, pasta_documentos_drive[…]
-[…]
+def download_original_files_from_folder(
+    greader: GoogleDriveReader,
+    pasta_documentos_drive: str,
+    local_path: str
+):
+    """Download the files only if they do not already exist locally."""
     os.makedirs(local_path, exist_ok=True)
     files_meta = greader._get_fileids_meta(folder_id=pasta_documentos_drive)
     if not files_meta:
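Note that the download helper leans on GoogleDriveReader internals (_creds, _get_fileids_meta); these are private members of llama-index-readers-google and can change between releases, so pinning that package's version is prudent.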
@@ -448,114 +362,161 @@ def download_original_files_from_folder(greader: GoogleDriveReader, pasta_docume
     else:
         logging.warning(f"Não foi possível baixar '{file_name}'")
 
-#
+# Drive folder
 pasta_documentos_drive = "1s0UUANcU1B0D2eyRweb1W5idUn1V5JEh"
 
+###############################################################################
+# RESOURCE CREATION/LOADING (avoids repeating steps)                          #
+###############################################################################
+# 1. Make sure we do not download the data again if it already exists.
 if not are_docs_downloaded(documents_path):
     logging.info("Baixando arquivos originais do Drive para 'documentos'...")
-    download_original_files_from_folder([…]
+    download_original_files_from_folder(
+        google_drive_reader,
+        pasta_documentos_drive,
+        documents_path
+    )
 else:
     logging.info("'documentos' já contém arquivos, ignorando download.")
 
-#
-file_extractor=[…]
-docstore[…]
-[…]
+# 2. If the docstore and index are not in the session state yet, create them.
+#    Otherwise, just reuse what already exists.
+if "docstore" not in st.session_state:
+    # Load documents from the local directory
+    file_extractor = {".csv": CustomPandasCSVReader()}
+    documents = SimpleDirectoryReader(
+        input_dir=documents_path,
+        file_extractor=file_extractor,
+        filename_as_id=True,
+        recursive=True
+    ).load_data()
+
+    documents = clean_documents(documents)
+
+    # Create the docstore
+    docstore = SimpleDocumentStore()
+    docstore.add_documents(documents)
+
+    st.session_state["docstore"] = docstore
+else:
+    docstore = st.session_state["docstore"]
+
+# 3. Set up the VectorStore + Chroma without recreating it if it already exists.
+if "vector_store" not in st.session_state:
+    db = chromadb.PersistentClient(path=chroma_storage_path)
+    chroma_collection = db.get_or_create_collection("dense_vectors")
+    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+    st.session_state["vector_store"] = vector_store
+else:
+    vector_store = st.session_state["vector_store"]
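The `if "docstore" not in st.session_state` guard above builds the resource once per browser session. Streamlit's st.cache_resource decorator expresses the same idea, sketched here with a stand-in loader (not part of the PR; the function name is hypothetical):

import streamlit as st

@st.cache_resource  # the body runs once; later reruns reuse the returned object
def load_expensive_resource(path: str) -> dict:
    # stand-in for reading the documents and building the docstore
    return {"documents_dir": path}

resource = load_expensive_resource("documentos")

One difference worth noting: st.session_state is scoped to a single browser session, while st.cache_resource shares the object across all sessions of the app.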
 
-# StorageContext configuration
 storage_context = StorageContext.from_defaults(
-    docstore=docstore,
+    docstore=docstore,
+    vector_store=vector_store
 )
 
-#
-[…]
+# 4. Load or create the index. If the Chroma database already has data, the
+#    index is assumed to have been persisted; otherwise it is created.
+if "index" not in st.session_state:
+    if os.path.exists(chroma_storage_path) and os.listdir(chroma_storage_path):
+        # There is saved data, so build the index from the vector_store
+        index = VectorStoreIndex.from_vector_store(vector_store)
+    else:
+        # Create the index (chunk_size can be tuned as needed)
+        splitter = LangchainNodeParser(
+            RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
+        )
+        index = VectorStoreIndex.from_documents(
+            list(docstore.docs.values()),
+            storage_context=storage_context,
+            transformations=[splitter]
+        )
+        vector_store.persist()
+    st.session_state["index"] = index
 else:
-[…]
-    )
-[…]
+    index = st.session_state["index"]
+
+# 5. Create or load the custom BM25Retriever
+if "bm25_retriever" not in st.session_state:
+    if (
+        os.path.exists(bm25_persist_path)
+        and os.path.exists(os.path.join(bm25_persist_path, "bm25.index.json"))
+    ):
+        bm25_retriever = BM25Retriever.from_persist_dir(bm25_persist_path)
+    else:
+        bm25_retriever = BM25Retriever.from_defaults(
+            docstore=docstore,
+            similarity_top_k=2,
+            language="portuguese",
+            verbose=True
+        )
+        os.makedirs(bm25_persist_path, exist_ok=True)
+        bm25_retriever.persist(bm25_persist_path)
+    st.session_state["bm25_retriever"] = bm25_retriever
 else:
-bm25_retriever = […]
+    bm25_retriever = st.session_state["bm25_retriever"]
+
+# 6. Create or retrieve the retriever that performs query fusion (BM25 + vector)
+if "fusion_retriever" not in st.session_state:
+    vector_retriever = index.as_retriever(similarity_top_k=2)
+    fusion_retriever = QueryFusionRetriever(
+        [bm25_retriever, vector_retriever],
         similarity_top_k=2,
-[…]
+        num_queries=0,
+        mode="reciprocal_rerank",
+        use_async=True,
+        verbose=True,
+        query_gen_prompt=(
+            "Gere {num_queries} perguntas de busca relacionadas à seguinte pergunta. "
+            "Priorize o significado da pergunta sobre qualquer histórico de conversa. "
+            "Se o histórico não for relevante, ignore-o. "
+            "Não adicione explicações ou introduções. Apenas escreva as perguntas. "
+            "Pergunta: {query}\n\nPerguntas:\n"
+        ),
     )
-[…]
-            "{context_str}"
-            "\nInstrução: Use o histórico da conversa anterior, ou o contexto acima, para responder."
-        ),
-        verbose=True,
-    )
-
-# Chat storage
-chat_store = SimpleChatStore()
-if os.path.exists(chat_store_path):
-    chat_store = SimpleChatStore.from_persist_path(persist_path=chat_store_path)
+    st.session_state["fusion_retriever"] = fusion_retriever
+else:
+    fusion_retriever = st.session_state["fusion_retriever"]
+
+# 7. Configure the chat engine if it is not in the session yet
+if "chat_engine" not in st.session_state:
+    nest_asyncio.apply()
+    memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
+    query_engine = RetrieverQueryEngine.from_args(fusion_retriever)
+
+    chat_engine = CondensePlusContextChatEngine.from_defaults(
+        query_engine,
+        memory=memory,
+        context_prompt=(
+            "Você é um assistente virtual capaz de interagir normalmente, além de "
+            "fornecer informações sobre organogramas e listar funcionários. "
+            "Aqui estão os documentos relevantes para o contexto:\n"
+            "{context_str}\n"
+            "Use o histórico anterior ou o contexto acima para responder."
+        ),
+        verbose=True,
+    )
+    st.session_state["chat_engine"] = chat_engine
+else:
+    chat_engine = st.session_state["chat_engine"]
+
+# 8. Chat store
+if "chat_store" not in st.session_state:
+    if os.path.exists(chat_store_path):
+        chat_store = SimpleChatStore.from_persist_path(persist_path=chat_store_path)
+    else:
+        chat_store = SimpleChatStore()
+        chat_store.persist(persist_path=chat_store_path)
+    st.session_state["chat_store"] = chat_store
 else:
-    chat_store.[…]
+    chat_store = st.session_state["chat_store"]
 
-
+###############################################################################
+# STREAMLIT CHAT INTERFACE                                                    #
+###############################################################################
 st.title("Chatbot Carômetro")
-st.write("Este […]
+st.write("Este assistente virtual pode te ajudar a encontrar informações relevantes sobre os carômetros da Sicoob.")
 
 if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []
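mode="reciprocal_rerank" merges the BM25 and vector result lists by reciprocal rank fusion; and since num_queries=0 (like 1) disables extra query generation in recent llama-index versions, query_gen_prompt only takes effect if that value is later raised. A toy illustration of the fusion idea (document ids are hypothetical; LlamaIndex's constant may differ from k=60):

def rrf(rankings, k=60):
    # score(d) = sum over rankings of 1 / (k + rank of d); higher is better
    scores = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank + 1)
    return sorted(scores, key=scores.get, reverse=True)

print(rrf([["a", "b", "c"], ["c", "a", "d"]]))  # -> ['a', 'c', 'b', 'd']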
@@ -567,23 +528,18 @@ for message in st.session_state.chat_history:
 
 user_input = st.chat_input("Digite sua pergunta")
 if user_input:
-    # Show the user's message and append it to the history
     with st.chat_message('user'):
         st.write(user_input)
         st.session_state.chat_history.append(f"user: {user_input}")
 
-    # Placeholder for the assistant's message
     with st.chat_message('assistant'):
         message_placeholder = st.empty()
         assistant_message = ''
 
-        # Get the streaming response from the chat_engine
        response = chat_engine.stream_chat(user_input)
         for token in response.response_gen:
             assistant_message += token
-            # Update the message placeholder
             message_placeholder.markdown(assistant_message + "▌")
 
-        # Remove the cursor once the response is complete
        message_placeholder.markdown(assistant_message)
-        st.session_state.chat_history.append(f"assistant: {assistant_message}")
+    st.session_state.chat_history.append(f"assistant: {assistant_message}")
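One loose end a reviewer might flag: chat_store is created and persisted, but this diff never shows it wired into the chat memory. The usual hookup, as a sketch under the standard ChatMemoryBuffer API (the chat_store_key below is hypothetical):

from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.storage.chat_store import SimpleChatStore

chat_store = SimpleChatStore()
memory = ChatMemoryBuffer.from_defaults(
    token_limit=3900,
    chat_store=chat_store,
    chat_store_key="user",  # hypothetical key; one per conversation
)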