Spaces:
Sleeping
Sleeping
issue chunk risolto
Browse files- app.py +5 -3
- app/document_handling.py +57 -20
- app/llm_handling.py +47 -29
- db/.DS_Store +0 -0
- ui/chunks_viewer_tab.py +104 -0
- ui/db_management_tab.py +2 -1
- ui/document_management_tab.py +8 -6
app.py
CHANGED
@@ -8,6 +8,7 @@ from ui.chatbot_tab import create_chatbot_tab
|
|
8 |
from ui.db_management_tab import create_db_management_tab
|
9 |
from ui.document_management_tab import create_document_management_tab
|
10 |
from ui.info_tab import create_info_tab # Importa la nuova tab
|
|
|
11 |
|
12 |
# Configura il logging
|
13 |
configure_logging()
|
@@ -24,21 +25,22 @@ def main():
|
|
24 |
info_refs = create_info_tab()
|
25 |
chat_refs = create_chatbot_tab()
|
26 |
doc_refs = create_document_management_tab()
|
27 |
-
|
28 |
db_refs = create_db_management_tab # Crea la nuova tab delle informazioni
|
29 |
|
30 |
# Crea dizionario completo dei riferimenti
|
31 |
dropdowns = {
|
32 |
"document": doc_refs,
|
33 |
"chat": chat_refs,
|
34 |
-
"info": info_refs
|
|
|
35 |
}
|
36 |
|
37 |
# Crea i tab nell'ordine corretto
|
38 |
chat_refs # Tab 4: Chatbot (ultima tab)
|
39 |
doc_refs # Tab 2: Document Management
|
40 |
db_refs(dropdowns)
|
41 |
-
|
42 |
info_refs # Tab 5: Info (ultima tab)
|
43 |
|
44 |
rag_chatbot.launch()
|
|
|
8 |
from ui.db_management_tab import create_db_management_tab
|
9 |
from ui.document_management_tab import create_document_management_tab
|
10 |
from ui.info_tab import create_info_tab # Importa la nuova tab
|
11 |
+
from ui.chunks_viewer_tab import create_chunks_viewer_tab # Aggiungi l'import in cima al file
|
12 |
|
13 |
# Configura il logging
|
14 |
configure_logging()
|
|
|
25 |
info_refs = create_info_tab()
|
26 |
chat_refs = create_chatbot_tab()
|
27 |
doc_refs = create_document_management_tab()
|
28 |
+
chunks_refs = create_chunks_viewer_tab() # Aggiungi il nuovo tab
|
29 |
db_refs = create_db_management_tab # Crea la nuova tab delle informazioni
|
30 |
|
31 |
# Crea dizionario completo dei riferimenti
|
32 |
dropdowns = {
|
33 |
"document": doc_refs,
|
34 |
"chat": chat_refs,
|
35 |
+
"info": info_refs,
|
36 |
+
"chunks": chunks_refs # Aggiungi il riferimento
|
37 |
}
|
38 |
|
39 |
# Crea i tab nell'ordine corretto
|
40 |
chat_refs # Tab 4: Chatbot (ultima tab)
|
41 |
doc_refs # Tab 2: Document Management
|
42 |
db_refs(dropdowns)
|
43 |
+
chunks_refs # Aggiungi il tab dei chunks
|
44 |
info_refs # Tab 5: Info (ultima tab)
|
45 |
|
46 |
rag_chatbot.launch()
|
app/document_handling.py
CHANGED
@@ -12,13 +12,44 @@ from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_doc
|
|
12 |
|
13 |
# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def upload_and_index(files, title, author, db_name="default_db"):
|
16 |
if not files:
|
17 |
-
return "Nessun file caricato."
|
18 |
|
19 |
documents = []
|
20 |
doc_metadata = []
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
for file in files:
|
23 |
try:
|
24 |
if file.name.endswith('.pdf'):
|
@@ -26,13 +57,16 @@ def upload_and_index(files, title, author, db_name="default_db"):
|
|
26 |
elif file.name.endswith('.docx'):
|
27 |
text = extract_text_from_docx(file.name)
|
28 |
else:
|
29 |
-
# File .txt o altro testo semplice
|
30 |
with open(file.name, 'r', encoding='utf-8') as f:
|
31 |
text = f.read()
|
32 |
|
33 |
chunks = create_chunks(text)
|
34 |
|
35 |
-
#
|
|
|
|
|
|
|
|
|
36 |
doc_meta = DocumentMetadata(
|
37 |
filename=os.path.basename(file.name),
|
38 |
title=title,
|
@@ -40,49 +74,52 @@ def upload_and_index(files, title, author, db_name="default_db"):
|
|
40 |
upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
41 |
chunks=len(chunks)
|
42 |
)
|
|
|
43 |
|
44 |
-
# Metadata per ogni chunk
|
45 |
for i, chunk in enumerate(chunks):
|
46 |
chunk_metadata = {
|
47 |
"content": chunk,
|
48 |
"source": os.path.basename(file.name),
|
49 |
"title": title,
|
50 |
"author": author,
|
51 |
-
"chunk_index": i,
|
52 |
"total_chunks": len(chunks),
|
53 |
"upload_date": doc_meta.upload_date
|
54 |
}
|
55 |
documents.append(chunk_metadata)
|
56 |
|
57 |
-
doc_metadata.append(doc_meta)
|
58 |
-
|
59 |
except Exception as e:
|
60 |
logging.error(f"Errore durante la lettura del file {file.name}: {e}")
|
61 |
continue
|
62 |
|
63 |
if documents:
|
64 |
try:
|
65 |
-
db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}") # Modifica qui
|
66 |
-
os.makedirs(db_path, exist_ok=True)
|
67 |
-
|
68 |
-
# Usa la funzione centralizzata invece dell'inizializzazione diretta
|
69 |
-
embeddings = get_embeddings()
|
70 |
texts = [doc["content"] for doc in documents]
|
71 |
metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
|
72 |
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
75 |
vectorstore.save_local(db_path)
|
76 |
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
return f"Documenti indicizzati con successo nel database '{db_name}'!"
|
81 |
except Exception as e:
|
82 |
-
|
83 |
-
|
|
|
84 |
|
85 |
-
return "Nessun documento processato."
|
86 |
|
87 |
def list_indexed_files(db_name="default_db"):
|
88 |
db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}") # Modifica qui
|
|
|
12 |
|
13 |
# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
|
14 |
|
15 |
+
def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge document metadata for a database with newly indexed documents.

    The ``metadata.json`` file stored under the database directory is the
    source of truth: when it exists, its content is loaded and
    ``existing_metadata`` is ignored.  When the file does not exist,
    ``existing_metadata`` is used as the starting point instead of being
    discarded.

    Args:
        existing_metadata: Fallback list of metadata dicts used only when no
            ``metadata.json`` is present on disk.  May be ``None`` or empty.
            The list is copied, never mutated.
        new_metadata: Iterable of new entries.  Objects exposing ``to_dict()``
            are converted through it; anything else is appended unchanged.
        db_name: Name of the database; the file is looked up in
            ``<BASE_DB_PATH>/faiss_index_<db_name>/metadata.json``.

    Returns:
        A new list containing the previous entries followed by the new ones.
    """
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")

    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            merged = json.load(f)
    else:
        # Copy so the caller's list is never modified in place.
        merged = list(existing_metadata or [])

    # Normalise the new entries to plain dicts so the result is JSON-serialisable.
    merged.extend(
        meta.to_dict() if hasattr(meta, 'to_dict') else meta
        for meta in new_metadata
    )
    return merged
|
30 |
+
|
31 |
def upload_and_index(files, title, author, db_name="default_db"):
|
32 |
if not files:
|
33 |
+
return False, "Nessun file caricato.", ""
|
34 |
|
35 |
documents = []
|
36 |
doc_metadata = []
|
37 |
|
38 |
+
# Crea directory del database se non esiste
|
39 |
+
db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
|
40 |
+
os.makedirs(db_path, exist_ok=True)
|
41 |
+
|
42 |
+
embeddings = get_embeddings()
|
43 |
+
existing_vectorstore = None
|
44 |
+
|
45 |
+
try:
|
46 |
+
if os.path.exists(os.path.join(db_path, "index.faiss")):
|
47 |
+
existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
|
48 |
+
except Exception as e:
|
49 |
+
logging.error(f"Errore caricamento vectorstore esistente: {e}")
|
50 |
+
existing_vectorstore = None
|
51 |
+
|
52 |
+
# Processa i nuovi file
|
53 |
for file in files:
|
54 |
try:
|
55 |
if file.name.endswith('.pdf'):
|
|
|
57 |
elif file.name.endswith('.docx'):
|
58 |
text = extract_text_from_docx(file.name)
|
59 |
else:
|
|
|
60 |
with open(file.name, 'r', encoding='utf-8') as f:
|
61 |
text = f.read()
|
62 |
|
63 |
chunks = create_chunks(text)
|
64 |
|
65 |
+
# Calcola l'offset per i nuovi chunks
|
66 |
+
chunk_offset = 0
|
67 |
+
if existing_vectorstore:
|
68 |
+
chunk_offset = len(existing_vectorstore.docstore._dict)
|
69 |
+
|
70 |
doc_meta = DocumentMetadata(
|
71 |
filename=os.path.basename(file.name),
|
72 |
title=title,
|
|
|
74 |
upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
75 |
chunks=len(chunks)
|
76 |
)
|
77 |
+
doc_metadata.append(doc_meta)
|
78 |
|
|
|
79 |
for i, chunk in enumerate(chunks):
|
80 |
chunk_metadata = {
|
81 |
"content": chunk,
|
82 |
"source": os.path.basename(file.name),
|
83 |
"title": title,
|
84 |
"author": author,
|
85 |
+
"chunk_index": chunk_offset + i,
|
86 |
"total_chunks": len(chunks),
|
87 |
"upload_date": doc_meta.upload_date
|
88 |
}
|
89 |
documents.append(chunk_metadata)
|
90 |
|
|
|
|
|
91 |
except Exception as e:
|
92 |
logging.error(f"Errore durante la lettura del file {file.name}: {e}")
|
93 |
continue
|
94 |
|
95 |
if documents:
|
96 |
try:
|
|
|
|
|
|
|
|
|
|
|
97 |
texts = [doc["content"] for doc in documents]
|
98 |
metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
|
99 |
|
100 |
+
if existing_vectorstore:
|
101 |
+
existing_vectorstore.add_texts(texts, metadatas=metadatas)
|
102 |
+
vectorstore = existing_vectorstore
|
103 |
+
else:
|
104 |
+
vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
|
105 |
+
|
106 |
vectorstore.save_local(db_path)
|
107 |
|
108 |
+
final_metadata = merge_metadata([], doc_metadata, db_name)
|
109 |
+
|
110 |
+
# Salva i metadati
|
111 |
+
metadata_path = os.path.join(db_path, "metadata.json")
|
112 |
+
with open(metadata_path, 'w') as f:
|
113 |
+
json.dump(final_metadata, f, indent=2)
|
114 |
+
|
115 |
+
return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
|
116 |
|
|
|
117 |
except Exception as e:
|
118 |
+
error_msg = f"Errore durante l'indicizzazione: {e}"
|
119 |
+
logging.error(error_msg)
|
120 |
+
return False, error_msg, ""
|
121 |
|
122 |
+
return False, "Nessun documento processato.", ""
|
123 |
|
124 |
def list_indexed_files(db_name="default_db"):
|
125 |
db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}") # Modifica qui
|
app/llm_handling.py
CHANGED
@@ -5,9 +5,14 @@ from langchain_community.vectorstores import FAISS
|
|
5 |
import requests
|
6 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
7 |
import json
|
|
|
8 |
|
9 |
-
from app.config import
|
10 |
-
|
|
|
|
|
|
|
|
|
11 |
from app.configs.prompts import SYSTEM_PROMPTS
|
12 |
from app.utils.embedding_utils import get_embeddings
|
13 |
|
@@ -47,20 +52,21 @@ def read_metadata(db_path):
|
|
47 |
return json.load(f)
|
48 |
return []
|
49 |
|
50 |
-
def get_relevant_documents(vectorstore, question
|
51 |
"""Retrieves relevant documents from the vectorstore"""
|
52 |
try:
|
53 |
enhanced_query = enhance_query(question)
|
54 |
docs_and_scores = vectorstore.similarity_search_with_score(
|
55 |
enhanced_query,
|
56 |
-
k=
|
57 |
)
|
58 |
filtered_docs = [
|
59 |
-
doc for doc, score in docs_and_scores
|
|
|
60 |
]
|
61 |
logging.info(f"Query: {question}")
|
62 |
logging.info(f"Documents found: {len(filtered_docs)}")
|
63 |
-
return filtered_docs
|
64 |
except Exception as e:
|
65 |
logging.error(f"Error retrieving documents: {e}")
|
66 |
return []
|
@@ -68,8 +74,7 @@ def get_relevant_documents(vectorstore, question, min_similarity=0.7):
|
|
68 |
def enhance_query(question):
|
69 |
stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
|
70 |
words = [w for w in question.lower().split() if w not in stop_words]
|
71 |
-
|
72 |
-
return enhanced_query
|
73 |
|
74 |
def log_search_results(question, docs_and_scores):
|
75 |
logging.info(f"Query: {question}")
|
@@ -78,39 +83,52 @@ def log_search_results(question, docs_and_scores):
|
|
78 |
logging.info(f"Content: {doc.page_content[:100]}...")
|
79 |
|
80 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
81 |
-
def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=
|
82 |
if chat_history is None:
|
83 |
chat_history = []
|
84 |
try:
|
85 |
-
|
86 |
db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
|
87 |
-
|
88 |
-
metadata_dict = {m["filename"]: m for m in metadata_list}
|
89 |
vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
|
90 |
relevant_docs = get_relevant_documents(vectorstore, question)
|
|
|
91 |
if not relevant_docs:
|
92 |
return [
|
93 |
{"role": "user", "content": question},
|
94 |
-
{"role": "assistant", "content": "
|
95 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
sources = []
|
97 |
-
for
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
])
|
106 |
-
sources_text = "\n\
|
107 |
prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
|
108 |
-
prompt += "\
|
|
|
|
|
109 |
messages = [
|
110 |
{"role": "system", "content": prompt},
|
111 |
*[{"role": m["role"], "content": m["content"]} for m in chat_history],
|
112 |
{"role": "user", "content": question}
|
113 |
]
|
|
|
114 |
client, model = get_llm_client(llm_type)
|
115 |
response = client.chat.completions.create(
|
116 |
model=model,
|
@@ -118,14 +136,15 @@ def answer_question(question, db_name, prompt_type="tutor", chat_history=None, l
|
|
118 |
temperature=0.7,
|
119 |
max_tokens=2048
|
120 |
)
|
|
|
121 |
answer = response.choices[0].message.content + sources_text
|
122 |
return [
|
123 |
{"role": "user", "content": question},
|
124 |
{"role": "assistant", "content": answer}
|
125 |
]
|
126 |
except Exception as e:
|
127 |
-
logging.error(f"Error
|
128 |
-
error_msg = "
|
129 |
return [
|
130 |
{"role": "user", "content": question},
|
131 |
{"role": "assistant", "content": f"⚠️ {error_msg}"}
|
@@ -144,11 +163,10 @@ class DocumentRetriever:
|
|
144 |
enhanced_query = enhance_query(question)
|
145 |
docs_and_scores = self.vectorstore.similarity_search_with_score(
|
146 |
enhanced_query,
|
147 |
-
k=
|
148 |
)
|
149 |
log_search_results(question, docs_and_scores)
|
150 |
-
|
151 |
-
# return self._filter_relevant_docs(docs_and_scores)
|
152 |
|
153 |
if __name__ == "__main__":
|
154 |
pass
|
|
|
5 |
import requests
|
6 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
7 |
import json
|
8 |
+
from collections import defaultdict
|
9 |
|
10 |
+
from app.config import (
|
11 |
+
BASE_DB_PATH,
|
12 |
+
LLM_CONFIGS,
|
13 |
+
LLMType,
|
14 |
+
EMBEDDING_CONFIG
|
15 |
+
)
|
16 |
from app.configs.prompts import SYSTEM_PROMPTS
|
17 |
from app.utils.embedding_utils import get_embeddings
|
18 |
|
|
|
52 |
return json.load(f)
|
53 |
return []
|
54 |
|
55 |
+
def get_relevant_documents(vectorstore, question):
    """Retrieve the documents that pass the configured score filter.

    The question is first normalised with ``enhance_query``; the vectorstore
    is then asked for ``EMBEDDING_CONFIG['k_documents']`` candidates and only
    those whose score is at least ``EMBEDDING_CONFIG['min_similarity']`` are
    kept.

    Args:
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``
            and returning ``(document, score)`` pairs.
        question: The user's question.

    Returns:
        The list of documents that passed the filter.  An empty list is
        returned when nothing passes or when any error occurs (the error is
        logged, not raised).
    """
    try:
        enhanced_query = enhance_query(question)
        docs_and_scores = vectorstore.similarity_search_with_score(
            enhanced_query,
            k=EMBEDDING_CONFIG['k_documents']
        )
        # NOTE(review): for LangChain's FAISS store this method is documented
        # to return a distance (lower = more similar).  If that applies here,
        # ">=" keeps the *least* similar hits - confirm whether
        # similarity_search_with_relevance_scores() (or "<=") was intended
        # before changing the threshold semantics.
        min_score = EMBEDDING_CONFIG['min_similarity']
        filtered_docs = [doc for doc, score in docs_and_scores if score >= min_score]
        logging.info(f"Query: {question}")
        logging.info(f"Documents found: {len(filtered_docs)}")
        return filtered_docs
    except Exception as e:
        # Best effort by design: callers treat an empty list as "nothing found".
        logging.error(f"Error retrieving documents: {e}")
        return []
|
|
|
74 |
# Italian articles that carry no meaning for retrieval; built once at import.
_QUERY_STOP_WORDS = frozenset({'il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'})


def enhance_query(question):
    """Normalise a question before it is sent to the vector search.

    The question is lower-cased, split on whitespace and stripped of Italian
    articles.  If that would leave nothing (the question consists only of
    articles), the lower-cased, whitespace-normalised question is returned
    instead, so that an empty query is never produced from non-empty input.

    Args:
        question: The raw user question.

    Returns:
        The cleaned query string (empty only when the input has no words).
    """
    tokens = question.lower().split()
    kept = [word for word in tokens if word not in _QUERY_STOP_WORDS]
    # Fall back to the unfiltered tokens rather than searching for "".
    return " ".join(kept or tokens)
|
|
|
78 |
|
79 |
def log_search_results(question, docs_and_scores):
|
80 |
logging.info(f"Query: {question}")
|
|
|
83 |
logging.info(f"Content: {doc.page_content[:100]}...")
|
84 |
|
85 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
86 |
+
def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
|
87 |
if chat_history is None:
|
88 |
chat_history = []
|
89 |
try:
|
90 |
+
# Setup e recupero documenti
|
91 |
db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
|
92 |
+
embeddings = get_embeddings()
|
|
|
93 |
vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
|
94 |
relevant_docs = get_relevant_documents(vectorstore, question)
|
95 |
+
|
96 |
if not relevant_docs:
|
97 |
return [
|
98 |
{"role": "user", "content": question},
|
99 |
+
{"role": "assistant", "content": "Mi dispiace, non ho trovato informazioni rilevanti."}
|
100 |
]
|
101 |
+
|
102 |
+
# Leggi metadata.json per il totale dei chunks
|
103 |
+
metadata_path = os.path.join("db", f"faiss_index_{db_name}", "metadata.json")
|
104 |
+
with open(metadata_path, 'r') as f:
|
105 |
+
metadata_list = json.load(f)
|
106 |
+
|
107 |
+
# Crea dizionario titolo -> chunks
|
108 |
+
total_chunks = {doc['title']: doc['chunks'] for doc in metadata_list}
|
109 |
+
|
110 |
+
# Prepara le fonti
|
111 |
sources = []
|
112 |
+
for doc in relevant_docs:
|
113 |
+
meta = doc.metadata
|
114 |
+
title = meta.get('title', 'Unknown')
|
115 |
+
chunk_index = meta.get('chunk_index', 0)
|
116 |
+
total_doc_chunks = total_chunks.get(title, 0)
|
117 |
+
sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Chunk {chunk_index+1} di {total_doc_chunks}")
|
118 |
+
|
119 |
+
# Prepara contesto e prompt
|
120 |
+
context = "\n".join([doc.page_content for doc in relevant_docs])
|
121 |
+
sources_text = "\n\nFonti consultate:\n" + "\n".join(set(sources))
|
122 |
prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
|
123 |
+
prompt += "\nCita sempre le fonti utilizzate nella risposta, inclusi titolo e autore."
|
124 |
+
|
125 |
+
# Crea messaggio e ottieni risposta
|
126 |
messages = [
|
127 |
{"role": "system", "content": prompt},
|
128 |
*[{"role": m["role"], "content": m["content"]} for m in chat_history],
|
129 |
{"role": "user", "content": question}
|
130 |
]
|
131 |
+
|
132 |
client, model = get_llm_client(llm_type)
|
133 |
response = client.chat.completions.create(
|
134 |
model=model,
|
|
|
136 |
temperature=0.7,
|
137 |
max_tokens=2048
|
138 |
)
|
139 |
+
|
140 |
answer = response.choices[0].message.content + sources_text
|
141 |
return [
|
142 |
{"role": "user", "content": question},
|
143 |
{"role": "assistant", "content": answer}
|
144 |
]
|
145 |
except Exception as e:
|
146 |
+
logging.error(f"Error in answer_question: {e}")
|
147 |
+
error_msg = "LLM locale non disponibile." if "local" in str(llm_type) else str(e)
|
148 |
return [
|
149 |
{"role": "user", "content": question},
|
150 |
{"role": "assistant", "content": f"⚠️ {error_msg}"}
|
|
|
163 |
enhanced_query = enhance_query(question)
|
164 |
docs_and_scores = self.vectorstore.similarity_search_with_score(
|
165 |
enhanced_query,
|
166 |
+
k=EMBEDDING_CONFIG['k_documents']
|
167 |
)
|
168 |
log_search_results(question, docs_and_scores)
|
169 |
+
return [doc for doc, _ in docs_and_scores]
|
|
|
170 |
|
171 |
if __name__ == "__main__":
|
172 |
pass
|
db/.DS_Store
CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ
|
|
ui/chunks_viewer_tab.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
from langchain.vectorstores import FAISS
|
6 |
+
from app.document_handling import get_embeddings
|
7 |
+
from app.config import BASE_DB_PATH
|
8 |
+
from app.utils.database_handling import list_databases
|
9 |
+
|
10 |
+
|
11 |
+
def create_chunks_viewer_tab():
    """Build the "Visualizza Chunks" tab used to inspect database chunks.

    The tab contains a database selector, a chunk selector (filled when the
    database changes), a button that shows the selected chunk's text, and a
    status box for errors.

    Returns:
        A dict with the ``db_selector`` dropdown, so other tabs can refresh
        its choices.
    """

    def load_chunks(db_name):
        """Return an updated chunk dropdown and a status message for *db_name*.

        Labels have the form ``"Chunk <n> - <title>"`` where ``<n>`` is the
        1-based position of the chunk inside its own document, as declared by
        the ``chunks`` count in ``metadata.json``.
        """
        if not db_name:
            return gr.Dropdown(choices=[], interactive=False), "Seleziona un database"

        try:
            metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)

            chunk_list = []
            for doc in metadata:
                for i in range(doc['chunks']):
                    chunk_list.append(f"Chunk {i+1} - {doc['title']}")

            return gr.Dropdown(choices=chunk_list, interactive=True), ""
        except Exception as e:
            logging.error(f"Errore nel caricamento chunks: {e}")
            return gr.Dropdown(choices=[], interactive=False), f"Errore: {e}"

    def inspect_chunk(db_name, chunk_id):
        """Return the text of the chunk identified by a ``"Chunk <n> - <title>"`` label."""
        if not db_name or not chunk_id:
            return "Seleziona un database e un chunk"

        try:
            db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
            embeddings = get_embeddings()
            vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

            # Split only on the first separator: the title itself may contain " - ".
            number_part, _, title = chunk_id.partition(" - ")
            chunk_num = int(number_part.replace("Chunk ", "")) - 1

            # The number in the label is relative to the document, not to the
            # whole store, so it must be resolved among the entries that carry
            # the selected title.  The docstore dict keeps insertion order.
            all_docs = list(vectorstore.docstore._dict.values())
            title_docs = [
                doc for doc in all_docs
                if (doc.metadata or {}).get('title') == title
            ]
            # Entries without a stored title: fall back to positional lookup
            # over the whole store.
            candidates = title_docs or all_docs
            # NOTE(review): documents sharing the same title produce identical
            # labels; such labels resolve within the combined group of entries.

            if not 0 <= chunk_num < len(candidates):
                return f"Errore: chunk {chunk_num + 1} non trovato nel database"

            return candidates[chunk_num].page_content

        except Exception as e:
            logging.error(f"Errore nell'ispezione del chunk: {e}")
            return f"Errore nel recupero del contenuto: {e}"

    with gr.Tab("Visualizza Chunks"):
        gr.Markdown("## Ispeziona Chunks dei Database")

        # Scan the databases once instead of once per use.
        databases = list_databases()

        with gr.Row():
            with gr.Column():
                # Selectors
                db_selector = gr.Dropdown(
                    choices=databases,
                    label="Seleziona Database",
                    value=databases[0] if databases else None
                )
                chunk_selector = gr.Dropdown(
                    choices=[],
                    label="Seleziona Chunk",
                    interactive=False
                )
                inspect_button = gr.Button("Visualizza Contenuto")

            with gr.Column():
                # Content display area
                chunk_content = gr.TextArea(
                    label="Contenuto del Chunk",
                    interactive=False,
                    lines=20
                )
                error_box = gr.Textbox(
                    label="Status",
                    visible=True,
                    interactive=False
                )

        # Events
        db_selector.change(
            fn=load_chunks,
            inputs=[db_selector],
            outputs=[chunk_selector, error_box]
        )

        inspect_button.click(
            fn=inspect_chunk,
            inputs=[db_selector, chunk_selector],
            outputs=[chunk_content]
        )

    return {"db_selector": db_selector}
|
ui/db_management_tab.py
CHANGED
@@ -9,6 +9,8 @@ def create_db_management_tab(dropdowns):
|
|
9 |
# Aggiorna tutti i dropdown dell'applicazione (5 invece di 4)
|
10 |
return [gr.update(choices=updated_dbs) for _ in range(5)]
|
11 |
|
|
|
|
|
12 |
with gr.Tab("Gestione Database"):
|
13 |
gr.Markdown("## Operazioni sui Database")
|
14 |
|
@@ -86,4 +88,3 @@ def create_db_management_tab(dropdowns):
|
|
86 |
|
87 |
# Ritorna i componenti che vogliamo poter aggiornare/agganciare
|
88 |
return [modify_db_old_name, delete_db_dropdown, create_db_button, modify_db_button, delete_db_button]
|
89 |
-
|
|
|
9 |
# Aggiorna tutti i dropdown dell'applicazione (5 invece di 4)
|
10 |
return [gr.update(choices=updated_dbs) for _ in range(5)]
|
11 |
|
12 |
+
|
13 |
+
|
14 |
with gr.Tab("Gestione Database"):
|
15 |
gr.Markdown("## Operazioni sui Database")
|
16 |
|
|
|
88 |
|
89 |
# Ritorna i componenti che vogliamo poter aggiornare/agganciare
|
90 |
return [modify_db_old_name, delete_db_dropdown, create_db_button, modify_db_button, delete_db_button]
|
|
ui/document_management_tab.py
CHANGED
@@ -16,14 +16,16 @@ def create_document_management_tab():
|
|
16 |
]
|
17 |
|
18 |
def upload_and_index_callback(files, title, author, db_name):
|
19 |
-
"""Carica e indicizza i documenti, quindi aggiorna la lista dei file."""
|
20 |
try:
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
24 |
except Exception as e:
|
25 |
-
|
26 |
-
|
|
|
27 |
|
28 |
def list_files_callback(db_name):
|
29 |
"""Elenca i file indicizzati nel database conoscenze specificato."""
|
|
|
16 |
]
|
17 |
|
18 |
def upload_and_index_callback(files, title, author, db_name):
    """Index the uploaded files and return the values for the UI outputs.

    Args:
        files: Uploaded file objects forwarded to ``upload_and_index``.
        title: Title applied to the uploaded documents.
        author: Author applied to the uploaded documents.
        db_name: Target database name.

    Returns:
        A 3-tuple ``(status_message, databases, databases)``: the message
        reported by ``upload_and_index`` (or an error message if it raised)
        followed by the current database list, twice.
    """
    try:
        # The success flag and the details string do not affect the outputs:
        # the same values are returned whether indexing succeeded or not.
        _success, message, _details = upload_and_index(files, title, author, db_name)
        return message, list_databases(), list_databases()
    except Exception as e:
        error_msg = f"Errore durante l'upload: {e}"
        logging.error(error_msg)
        return error_msg, list_databases(), list_databases()
|
29 |
|
30 |
def list_files_callback(db_name):
|
31 |
"""Elenca i file indicizzati nel database conoscenze specificato."""
|