Improvements to document loading

#4
Files changed (1)
  1. app.py +178 -222
app.py CHANGED
@@ -1,86 +1,43 @@
 import logging
 import sys
 import os
+
 import re
 import base64
 import nest_asyncio
+nest_asyncio.apply()
 import pandas as pd
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from PIL import Image
 import streamlit as st
 import torch
-from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, Document
+
+# LlamaIndex imports
+from llama_index.core import (
+    Settings,
+    SimpleDirectoryReader,
+    StorageContext,
+    Document
+)
+
 from llama_index.core.storage.docstore import SimpleDocumentStore
-# from llama_index.llms.ollama import Ollama
-# from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.core.node_parser import LangchainNodeParser
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from llama_index.core.storage.chat_store import SimpleChatStore
 from llama_index.core.memory import ChatMemoryBuffer
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.chat_engine import CondensePlusContextChatEngine
-#from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
 from llama_index.vector_stores.chroma import ChromaVectorStore
 from llama_index.core import VectorStoreIndex
-# from llama_index.llms.huggingface import HuggingFaceLLM
-# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
-# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
 import chromadb
 
 ###############################################################################
 # MONKEY PATCH IN bm25s                                                       #
 ###############################################################################
 import bm25s
-
-# Keep a reference to the original function
-orig_find_newline_positions = bm25s.utils.corpus.find_newline_positions
-
-def patched_find_newline_positions(path, show_progress=True, leave_progress=True):
-    """
-    Workaround version of the original function that forces encoding='utf-8'
-    and ignores decoding errors, avoiding UnicodeDecodeError even when the
-    file contains characters outside the UTF-8 range.
-
-    (Based on standard Python file-reading adjustments.)
-    """
-    path = str(path)
-    indexes = []
-
-    with open(path, "r", encoding="utf-8", errors="ignore") as f:
-        indexes.append(f.tell())
-        file_size = os.path.getsize(path)
-
-        try:
-            from tqdm.auto import tqdm
-            pbar = tqdm(
-                total=file_size,
-                desc="Finding newlines for mmindex",
-                unit="B",
-                unit_scale=True,
-                leave=leave_progress,
-                disable=not show_progress,
-            )
-        except ImportError:
-            pbar = None
-
-        while True:
-            line = f.readline()
-            if not line:
-                break
-            t = f.tell()
-            indexes.append(t)
-            if pbar is not None:
-                pbar.update(t - indexes[-2])
-
-        if pbar is not None:
-            pbar.close()
-
-    return indexes[:-1]
-
-# Apply the patch
-bm25s.utils.corpus.find_newline_positions = patched_find_newline_positions
 ###############################################################################
 # BM25Retriever CLASS (ADJUSTED FOR ENCODING)                                 #
 ###############################################################################
@@ -261,28 +218,21 @@ class BM25Retriever(BaseRetriever):
 
         return nodes
 
-# Tab icon configuration
-im = Image.open("pngegg.png")
-st.set_page_config(page_title = "Chatbot Carômetro", page_icon=im, layout = "wide")
-
-# Loop removed; os.makedirs added
-os.makedirs("bm25_retriever", exist_ok=True)
-os.makedirs("chat_store", exist_ok=True)
-os.makedirs("chroma_db", exist_ok=True)
-os.makedirs("documentos", exist_ok=True)
-
-# Streamlit configuration
+
+###############################################################################
+# STREAMLIT CONFIGURATION AND PIPELINE ADJUSTMENTS                            #
+###############################################################################
+# Avoid re-indexing or re-downloading data by storing state in the session.
+im = Image.open("pngegg.png")
+st.set_page_config(page_title="Chatbot Carômetro", page_icon=im, layout="wide")
+
+# Sidebar sections
 st.sidebar.title("Configuração de LLM")
 sidebar_option = st.sidebar.radio("Selecione o LLM", ["gpt-3.5-turbo"])
 
-# logo_url = 'app\logos\logo-sicoob.jpg'
-# st.sidebar.image(logo_url)
 import base64
-
-# Sidebar image configuration
 with open("sicoob-logo.png", "rb") as f:
     data = base64.b64encode(f.read()).decode("utf-8")
-
 st.sidebar.markdown(
     f"""
     <div style="display:table;margin-top:-80%;margin-left:0%;">
@@ -292,67 +242,24 @@ with open("sicoob-logo.png", "rb") as f:
     unsafe_allow_html=True,
 )
 
-#if sidebar_option == "Ollama":
-#    Settings.llm = Ollama(model="llama3.2:latest", request_timeout=500.0, num_gpu=1)
-#    Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text:latest")
 if sidebar_option == "gpt-3.5-turbo":
     from llama_index.llms.openai import OpenAI
     from llama_index.embeddings.openai import OpenAIEmbedding
     Settings.llm = OpenAI(model="gpt-3.5-turbo")
     Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
-# elif sidebar_option == 'NuExtract-1.5':
-#     # HuggingFace embedding
-#     Settings.embed_model = HuggingFaceEmbedding(
-#         model_name="BAAI/bge-small-en-v1.5"
-#     )
-#     # Local model loading; uncomment the desired model
-
-#     llm = HuggingFaceLLM(
-#         context_window=2048,
-#         max_new_tokens=2048,
-#         generate_kwargs={"do_sample": False},
-#         #query_wrapper_prompt=query_wrapper_prompt,
-#         #model_name="Qwen/Qwen2.5-Coder-32B-Instruct",
-#         #model_name="Qwen/Qwen2.5-14B-Instruct",
-#         # model_name="meta-llama/Llama-3.2-3B",
-#         #model_name="HuggingFaceH4/zephyr-7b-beta",
-#         # model_name="meta-llama/Meta-Llama-3-8B",
-#         model_name="numind/NuExtract-1.5",
-#         #model_name="meta-llama/Llama-3.2-3B",
-#         tokenizer_name="numind/NuExtract-1.5",
-#         device_map="auto",
-#         tokenizer_kwargs={"max_length": 512},
-#         # uncomment this if using CUDA to reduce memory usage
-#         model_kwargs={"torch_dtype": torch.bfloat16},
-#     )
-#     chat = [
-#         {"role": "user", "content": "Hello, how are you?"},
-#         {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
-#         {"role": "user", "content": "I'd like to show off how chat templating works!"},
-#     ]
-
-#     from transformers import AutoTokenizer
-
-#     tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract-1.5")
-#     tokenizer.apply_chat_template(chat, tokenize=False)
-
-#     Settings.chunk_size = 512
-#     Settings.llm = llm
-
 else:
     raise Exception("Opção de LLM inválida!")
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 
-
-# User-configured directories
+# Main paths
 chat_store_path = os.path.join("chat_store", "chat_store.json")
-documents_path = os.path.join("documentos")
-chroma_storage_path = os.path.join("chroma_db")  # Chroma persistence directory
-bm25_persist_path = os.path.join("bm25_retriever")
+documents_path = "documentos"
+chroma_storage_path = "chroma_db"
+bm25_persist_path = "bm25_retriever"
 
-# Custom CSV class (new code)
+# Custom CSV reader class
 class CustomPandasCSVReader:
     """PandasCSVReader modified to include headers in the documents."""
     def __init__(
@@ -394,15 +301,20 @@ class CustomPandasCSVReader:
             for text in text_list
         ]
 
-def clean_documents(documents):
-    """Remove unwanted characters directly from the document texts."""
-    cleaned_documents = []
+def clean_documents(documents: List[Document]) -> List[Document]:
+    """Remove unwanted characters directly from the texts."""
+    cleaned_docs = []
     for doc in documents:
         cleaned_text = re.sub(r"[^0-9A-Za-zÀ-ÿ ]", "", doc.get_content())
         doc.text = cleaned_text
-        cleaned_documents.append(doc)
-    return cleaned_documents
+        cleaned_docs.append(doc)
+    return cleaned_docs
 
+def are_docs_downloaded(directory_path: str) -> bool:
+    """Check whether the directory contains any files."""
+    return os.path.isdir(directory_path) and any(os.scandir(directory_path))
+
+# Reading files from Google Drive
 from llama_index.readers.google import GoogleDriveReader
 import json
 
@@ -424,10 +336,12 @@ with open(token_path, 'w') as credentials_file:
 google_drive_reader = GoogleDriveReader(credentials_path=credentials_path)
 google_drive_reader._creds = google_drive_reader._get_credentials()
 
-def are_docs_downloaded(directory_path: str) -> bool:
-    return os.path.isdir(directory_path) and any(os.scandir(directory_path))
-
-def download_original_files_from_folder(greader: GoogleDriveReader, pasta_documentos_drive: str, local_path: str):
+def download_original_files_from_folder(
+    greader: GoogleDriveReader,
+    pasta_documentos_drive: str,
+    local_path: str
+):
+    """Download the files only if they do not already exist locally."""
    os.makedirs(local_path, exist_ok=True)
    files_meta = greader._get_fileids_meta(folder_id=pasta_documentos_drive)
    if not files_meta:
@@ -448,114 +362,161 @@ def download_original_files_from_folder(greader: GoogleDriveReader, pasta_docume
         else:
             logging.warning(f"Não foi possível baixar '{file_name}'")
 
-#DADOS/QA_database/Documentos CSV/documentos
+# Drive folder
 pasta_documentos_drive = "1s0UUANcU1B0D2eyRweb1W5idUn1V5JEh"
 
-# Check and download files if needed (new code)
+###############################################################################
+# RESOURCE CREATION/LOADING (avoids repeating steps)                          #
+###############################################################################
+# 1. Make sure we do not download the data again if it already exists.
 if not are_docs_downloaded(documents_path):
     logging.info("Baixando arquivos originais do Drive para 'documentos'...")
-    download_original_files_from_folder(google_drive_reader, pasta_documentos_drive, documents_path)
+    download_original_files_from_folder(
+        google_drive_reader,
+        pasta_documentos_drive,
+        documents_path
+    )
 else:
     logging.info("'documentos' já contém arquivos, ignorando download.")
 
-# Document reading configuration
-file_extractor = {".csv": CustomPandasCSVReader()}
-documents = SimpleDirectoryReader(
-    input_dir=documents_path,
-    file_extractor=file_extractor,
-    filename_as_id=True,
-    recursive=True
-    # recursive in case the Drive has several folders
-).load_data()
-
-documents = clean_documents(documents)
-
-# Chroma and BM25 setup with persistence
-docstore = SimpleDocumentStore()
-docstore.add_documents(documents)
-
-db = chromadb.PersistentClient(path=chroma_storage_path)
-chroma_collection = db.get_or_create_collection("dense_vectors")
-vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
-
-# StorageContext setup
-storage_context = StorageContext.from_defaults(
-    docstore=docstore, vector_store=vector_store
-)
-
-# Index creation/reload with embeddings
-if os.path.exists(chroma_storage_path):
-    index = VectorStoreIndex.from_vector_store(vector_store)
-else:
-    splitter = LangchainNodeParser(
-        RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
-    )
-    index = VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        transformations=[splitter]
-    )
-    vector_store.persist()
-
-# BM25 retriever creation/reload
-if os.path.exists(os.path.join(bm25_persist_path, "params.index.json")):
-    bm25_retriever = BM25Retriever.from_persist_dir(bm25_persist_path)
-else:
-    bm25_retriever = BM25Retriever.from_defaults(
-        docstore=docstore,
-        similarity_top_k=2,
-        language="portuguese",  # language adjusted for this use case
-    )
-    os.makedirs(bm25_persist_path, exist_ok=True)
-    bm25_retriever.persist(bm25_persist_path)
-
-# Retriever combination (embeddings + BM25)
-vector_retriever = index.as_retriever(similarity_top_k=2)
-retriever = QueryFusionRetriever(
-    [vector_retriever, bm25_retriever],
-    similarity_top_k=3,
-    num_queries=0,
-    mode="reciprocal_rerank",
-    use_async=True,
-    verbose=True,
-    query_gen_prompt=(
-        "Gere {num_queries} perguntas de busca relacionadas à seguinte pergunta. "
-        "Priorize o significado da pergunta sobre qualquer histórico de conversa. "
-        "Se o histórico não for relevante para a pergunta, ignore-o. "
-        "Não adicione explicações, notas ou introduções. Apenas escreva as perguntas. "
-        "Pergunta: {query}\n\n"
-        "Perguntas:\n"
-    ),
-)
-
-# Chat engine setup
-nest_asyncio.apply()
-memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
-query_engine = RetrieverQueryEngine.from_args(retriever)
-chat_engine = CondensePlusContextChatEngine.from_defaults(
-    query_engine,
-    memory=memory,
-    chat_mode="context",
-    context_prompt=(
-        "Você é um assistente virtual capaz de interagir normalmente, além de"
-        " fornecer informações sobre organogramas e listar funcionários."
-        " Aqui estão os documentos relevantes para o contexto:\n"
-        "{context_str}"
-        "\nInstrução: Use o histórico da conversa anterior, ou o contexto acima, para responder."
-    ),
-    verbose=True,
-)
-
-# Chat storage
-chat_store = SimpleChatStore()
-if os.path.exists(chat_store_path):
-    chat_store = SimpleChatStore.from_persist_path(persist_path=chat_store_path)
-else:
-    chat_store.persist(persist_path=chat_store_path)
-
-# Chatbot interface
+# 2. If the docstore and index are not yet in the session state, create them.
+#    Otherwise, just reuse what already exists.
+if "docstore" not in st.session_state:
+    # Load documents from the local directory
+    file_extractor = {".csv": CustomPandasCSVReader()}
+    documents = SimpleDirectoryReader(
+        input_dir=documents_path,
+        file_extractor=file_extractor,
+        filename_as_id=True,
+        recursive=True
+    ).load_data()
+
+    documents = clean_documents(documents)
+
+    # Create the docstore
+    docstore = SimpleDocumentStore()
+    docstore.add_documents(documents)
+
+    st.session_state["docstore"] = docstore
+else:
+    docstore = st.session_state["docstore"]
+
+# 3. Set up the VectorStore + Chroma without recreating it if already built.
+if "vector_store" not in st.session_state:
+    db = chromadb.PersistentClient(path=chroma_storage_path)
+    chroma_collection = db.get_or_create_collection("dense_vectors")
+    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+    st.session_state["vector_store"] = vector_store
+else:
+    vector_store = st.session_state["vector_store"]
+
+storage_context = StorageContext.from_defaults(
+    docstore=docstore,
+    vector_store=vector_store
+)
+
+# 4. Load or create the index. If the Chroma store already has data, assume
+#    the index was persisted; otherwise, create it.
+if "index" not in st.session_state:
+    if os.path.exists(chroma_storage_path) and os.listdir(chroma_storage_path):
+        # Saved data exists, so build the index from the vector_store
+        index = VectorStoreIndex.from_vector_store(vector_store)
+    else:
+        # Create the index (chunk_size can be configured as needed)
+        splitter = LangchainNodeParser(
+            RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
+        )
+        index = VectorStoreIndex.from_documents(
+            list(docstore.docs.values()),
+            storage_context=storage_context,
+            transformations=[splitter]
+        )
+        vector_store.persist()
+    st.session_state["index"] = index
+else:
+    index = st.session_state["index"]
+
+# 5. Create or load the custom BM25Retriever
+if "bm25_retriever" not in st.session_state:
+    if (
+        os.path.exists(bm25_persist_path)
+        and os.path.exists(os.path.join(bm25_persist_path, "bm25.index.json"))
+    ):
+        bm25_retriever = BM25Retriever.from_persist_dir(bm25_persist_path)
+    else:
+        bm25_retriever = BM25Retriever.from_defaults(
+            docstore=docstore,
+            similarity_top_k=2,
+            language="portuguese",
+            verbose=True
+        )
+        os.makedirs(bm25_persist_path, exist_ok=True)
+        bm25_retriever.persist(bm25_persist_path)
+    st.session_state["bm25_retriever"] = bm25_retriever
+else:
+    bm25_retriever = st.session_state["bm25_retriever"]
+
+# 6. Create or reuse the retriever that performs query fusion (BM25 + vector)
+if "fusion_retriever" not in st.session_state:
+    vector_retriever = index.as_retriever(similarity_top_k=2)
+    fusion_retriever = QueryFusionRetriever(
+        [bm25_retriever, vector_retriever],
+        similarity_top_k=2,
+        num_queries=0,
+        mode="reciprocal_rerank",
+        use_async=True,
+        verbose=True,
+        query_gen_prompt=(
+            "Gere {num_queries} perguntas de busca relacionadas à seguinte pergunta. "
+            "Priorize o significado da pergunta sobre qualquer histórico de conversa. "
+            "Se o histórico não for relevante, ignore-o. "
+            "Não adicione explicações ou introduções. Apenas escreva as perguntas. "
+            "Pergunta: {query}\n\nPerguntas:\n"
+        ),
+    )
+    st.session_state["fusion_retriever"] = fusion_retriever
+else:
+    fusion_retriever = st.session_state["fusion_retriever"]
+
+# 7. Set up the chat engine if it is not yet in the session
+if "chat_engine" not in st.session_state:
+    nest_asyncio.apply()
+    memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
+    query_engine = RetrieverQueryEngine.from_args(fusion_retriever)
+
+    chat_engine = CondensePlusContextChatEngine.from_defaults(
+        query_engine,
+        memory=memory,
+        context_prompt=(
+            "Você é um assistente virtual capaz de interagir normalmente, além de "
+            "fornecer informações sobre organogramas e listar funcionários. "
+            "Aqui estão os documentos relevantes para o contexto:\n"
+            "{context_str}\n"
+            "Use o histórico anterior ou o contexto acima para responder."
+        ),
+        verbose=True,
+    )
+    st.session_state["chat_engine"] = chat_engine
+else:
+    chat_engine = st.session_state["chat_engine"]
+
+# 8. Chat storage
+if "chat_store" not in st.session_state:
+    if os.path.exists(chat_store_path):
+        chat_store = SimpleChatStore.from_persist_path(persist_path=chat_store_path)
+    else:
+        chat_store = SimpleChatStore()
+        chat_store.persist(persist_path=chat_store_path)
+    st.session_state["chat_store"] = chat_store
+else:
+    chat_store = st.session_state["chat_store"]
+
+
+###############################################################################
+# STREAMLIT CHAT INTERFACE                                                    #
+###############################################################################
 st.title("Chatbot Carômetro")
-st.write("Este chatbot pode te ajudar a conseguir informações relevantes sobre os carômetros da Sicoob.")
+st.write("Este assistente virtual pode te ajudar a encontrar informações relevantes sobre os carômetros da Sicoob.")
 
 if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []
@@ -567,23 +528,18 @@ for message in st.session_state.chat_history:
 
 user_input = st.chat_input("Digite sua pergunta")
 if user_input:
-    # Show the user's message and add it to the history
     with st.chat_message('user'):
         st.write(user_input)
     st.session_state.chat_history.append(f"user: {user_input}")
 
-    # Placeholder for the assistant's message
    with st.chat_message('assistant'):
        message_placeholder = st.empty()
        assistant_message = ''
 
-        # Get the streaming response from the chat_engine
        response = chat_engine.stream_chat(user_input)
        for token in response.response_gen:
            assistant_message += token
-            # Update the message placeholder
            message_placeholder.markdown(assistant_message + "▌")
 
-        # Remove the cursor once streaming finishes
        message_placeholder.markdown(assistant_message)
-        st.session_state.chat_history.append(f"assistant: {assistant_message}")
+    st.session_state.chat_history.append(f"assistant: {assistant_message}")
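
Note on the session-state pattern above: Streamlit re-executes the whole script on every user interaction, so anything built at module level (docstore, index, retrievers, chat engine) would otherwise be rebuilt on every message. A minimal sketch of the caching idiom this PR applies, with a hypothetical build_expensive_resource() standing in for the document-loading and indexing steps:

import streamlit as st

def build_expensive_resource():
    # Stand-in for costly work such as downloading documents,
    # embedding them, or constructing retrievers.
    return {"ready": True}

# Built once per browser session; later reruns reuse the cached object.
if "resource" not in st.session_state:
    st.session_state["resource"] = build_expensive_resource()
resource = st.session_state["resource"]

st.session_state is scoped to one browser session, so two users get independent copies; the on-disk stores (chroma_db, bm25_retriever, chat_store) are what carries over between sessions and restarts.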
 
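Across process restarts, the same goal is served by on-disk persistence: when the Chroma collection already holds vectors, the index wrapper is rebuilt from the store instead of re-embedding every document. A sketch of that reload path, assuming the chroma_db path and dense_vectors collection names used above:

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

db = chromadb.PersistentClient(path="chroma_db")
collection = db.get_or_create_collection("dense_vectors")
vector_store = ChromaVectorStore(chroma_collection=collection)

if collection.count() > 0:
    # Vectors already persisted: wrap the existing store, no re-embedding.
    index = VectorStoreIndex.from_vector_store(vector_store)

Checking collection.count() may be a stricter signal than os.path.exists()/os.listdir() on the storage directory, since a Chroma client can create files there before any vectors have been written.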