Sources fixed, at least I hope
- app/document_handling.py +69 -32
- app/llm_handling.py +10 -2
- db/.DS_Store +0 -0
- ui/chunks_viewer_tab.py +17 -8
app/document_handling.py
CHANGED
@@ -41,13 +41,22 @@ def upload_and_index(files, title, author, db_name="default_db"):
 
     embeddings = get_embeddings()
     existing_vectorstore = None
+    current_chunk_offset = 0
 
     try:
+        # Calcola l'ultimo ID chunk utilizzato
+        last_chunk_id = 0
+        if os.path.exists(os.path.join(db_path, "metadata.json")):
+            with open(os.path.join(db_path, "metadata.json"), 'r') as f:
+                existing_metadata = json.load(f)
+                last_chunk_id = sum(doc['chunks'] for doc in existing_metadata)
+
         if os.path.exists(os.path.join(db_path, "index.faiss")):
             existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
     except Exception as e:
         logging.error(f"Errore caricamento vectorstore esistente: {e}")
         existing_vectorstore = None
+        last_chunk_id = 0
 
     # Processa i nuovi file
     for file in files:
@@ -62,11 +71,6 @@ def upload_and_index(files, title, author, db_name="default_db"):
 
             chunks = create_chunks(text)
 
-            # Calcola l'offset per i nuovi chunks
-            chunk_offset = 0
-            if existing_vectorstore:
-                chunk_offset = len(existing_vectorstore.docstore._dict)
-
             doc_meta = DocumentMetadata(
                 filename=os.path.basename(file.name),
                 title=title,
@@ -76,18 +80,23 @@ def upload_and_index(files, title, author, db_name="default_db"):
             )
             doc_metadata.append(doc_meta)
 
+            # Aggiungi metadati a ogni chunk
             for i, chunk in enumerate(chunks):
+                chunk_id = last_chunk_id + i
                 chunk_metadata = {
                     "content": chunk,
                     "source": os.path.basename(file.name),
                     "title": title,
                     "author": author,
-                    "
-                    "
-                    "
+                    "chunk_id": chunk_id,  # ID univoco del chunk
+                    "doc_chunk_index": i,  # Indice del chunk nel documento
+                    "total_doc_chunks": len(chunks),
+                    "filename": os.path.basename(file.name)  # Aggiunto per riferimento
                 }
                 documents.append(chunk_metadata)
 
+            last_chunk_id += len(chunks)
+
         except Exception as e:
             logging.error(f"Errore durante la lettura del file {file.name}: {e}")
             continue
@@ -105,11 +114,9 @@ def upload_and_index(files, title, author, db_name="default_db"):
 
     vectorstore.save_local(db_path)
 
+    # Aggiorna metadata.json
     final_metadata = merge_metadata([], doc_metadata, db_name)
-
-    # Salva i metadati
-    metadata_path = os.path.join(db_path, "metadata.json")
-    with open(metadata_path, 'w') as f:
+    with open(os.path.join(db_path, "metadata.json"), 'w') as f:
         json.dump(final_metadata, f, indent=2)
 
     return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
@@ -151,32 +158,62 @@ def list_indexed_files(db_name="default_db"):
         return f"Errore nella lettura dei metadati: {e}"
 
 def delete_file_from_database(file_name, db_name="default_db"):
-    """
-
-
-    ma devi adattarla alle tue esigenze di rimozione dei chunk.
-    """
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
-    file_list_path = os.path.join(db_path, "file_list.txt")
+    """Elimina un file e i suoi chunks dal database."""
+    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+    metadata_path = os.path.join(db_path, "metadata.json")
 
-    if not os.path.exists(
-    return "Database non trovato (
+    if not os.path.exists(metadata_path):
+        return "Database non trovato (metadata.json mancante)."
 
     try:
-        #
-        with open(
-
-        #
-
-        #
-
-
-
-        return f"File '{file_name}' rimosso dal database '{db_name}'."
+        # Carica i metadati esistenti
+        with open(metadata_path, 'r') as f:
+            metadata = json.load(f)
+
+        # Trova il file da eliminare
+        file_index = next((i for i, doc in enumerate(metadata)
+                           if doc['filename'] == file_name), -1)
+
+        if file_index == -1:
+            return f"File '{file_name}' non trovato nel database."
+
+        # Carica il vectorstore esistente
+        embeddings = get_embeddings()
+        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
+
+        # Calcola l'intervallo di chunks da rimuovere
+        chunks_before = sum(doc['chunks'] for doc in metadata[:file_index])
+        chunks_to_remove = metadata[file_index]['chunks']
+
+        # Estrai tutti i documenti tranne quelli da rimuovere
+        all_docs = list(vectorstore.docstore._dict.items())
+        docs_to_keep = (
+            all_docs[:chunks_before] +
+            all_docs[chunks_before + chunks_to_remove:]
+        )
+
+        # Rimuovi il file dai metadati
+        metadata.pop(file_index)
+
+        # Ricrea il vectorstore da zero
+        if docs_to_keep:
+            texts = [doc[1].page_content for doc in docs_to_keep]
+            metadatas = [doc[1].metadata for doc in docs_to_keep]
+            new_vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
+            new_vectorstore.save_local(db_path)
+        else:
+            # Se non ci sono più documenti, rimuovi il vectorstore
+            os.remove(os.path.join(db_path, "index.faiss"))
+            os.remove(os.path.join(db_path, "index.pkl"))
+
+        # Salva i metadati aggiornati
+        with open(metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=2)
+
+        return f"File '{file_name}' eliminato con successo."
 
     except Exception as e:
-
+        logging.error(f"Errore durante l'eliminazione: {e}")
+        return f"Errore durante l'eliminazione: {e}"
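
A note on the chunk numbering introduced above: each chunk_id is the count of chunks already recorded in metadata.json plus the chunk's position inside the new document, so numbering keeps following the docstore's insertion order across repeated uploads. Below is a minimal, self-contained sketch of that bookkeeping; existing_metadata and new_docs are hypothetical stand-ins for the parsed metadata.json and the freshly uploaded files.

# Sketch of the incremental chunk_id assignment performed in upload_and_index.
existing_metadata = [            # hypothetical content of metadata.json
    {"filename": "a.pdf", "chunks": 3},
    {"filename": "b.pdf", "chunks": 5},
]
last_chunk_id = sum(doc["chunks"] for doc in existing_metadata)  # 8 chunks already indexed

new_docs = {"c.pdf": ["chunk one", "chunk two"]}  # hypothetical new upload
records = []
for filename, chunks in new_docs.items():
    for i, chunk in enumerate(chunks):
        records.append({
            "filename": filename,
            "chunk_id": last_chunk_id + i,   # global ID, continues after existing chunks
            "doc_chunk_index": i,            # position within this document
            "total_doc_chunks": len(chunks),
        })
    last_chunk_id += len(chunks)

print([r["chunk_id"] for r in records])  # [8, 9]

The printed IDs continue exactly where the previous upload stopped, which is what keeps them aligned with the positional lookups used elsewhere in the app.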
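The rewritten delete_file_from_database relies on the FAISS docstore preserving insertion order, so one document's chunks occupy a contiguous slice whose bounds can be derived from the per-document chunk counts in metadata.json. A small sketch of that slice arithmetic, with a plain list standing in for vectorstore.docstore._dict (all names here are illustrative):

# Sketch: drop one document's chunks by position, as delete_file_from_database does.
metadata = [                      # mirrors metadata.json
    {"filename": "a.pdf", "chunks": 3},
    {"filename": "b.pdf", "chunks": 5},
    {"filename": "c.pdf", "chunks": 2},
]
all_docs = [f"chunk-{i}" for i in range(10)]  # 3 + 5 + 2 entries, in insertion order

file_name = "b.pdf"
file_index = next(i for i, doc in enumerate(metadata) if doc["filename"] == file_name)

chunks_before = sum(doc["chunks"] for doc in metadata[:file_index])  # 3
chunks_to_remove = metadata[file_index]["chunks"]                    # 5

docs_to_keep = all_docs[:chunks_before] + all_docs[chunks_before + chunks_to_remove:]
metadata.pop(file_index)

print(docs_to_keep)  # ['chunk-0', 'chunk-1', 'chunk-2', 'chunk-8', 'chunk-9']

Everything in docs_to_keep is then re-embedded with FAISS.from_texts and saved, which is why the function rebuilds the index from scratch rather than deleting vectors in place.
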
app/llm_handling.py
CHANGED
@@ -112,9 +112,17 @@ def answer_question(question, db_name, prompt_type="tutor", chat_history=None, l
     for doc in relevant_docs:
         meta = doc.metadata
         title = meta.get('title', 'Unknown')
-
+        author = meta.get('author', 'Unknown')
+        filename = meta.get('filename', 'Unknown')
+        chunk_id = meta.get('chunk_id', 0)  # Usa l'ID univoco del chunk
         total_doc_chunks = total_chunks.get(title, 0)
-
+
+        # Usa lo stesso formato di chunks_viewer_tab.py
+        chunk_info = f"📚 Chunk {chunk_id} - {title} ({filename})"
+        if author != 'Unknown':
+            chunk_info += f" - Author: {author}"
+
+        sources.append(chunk_info)
 
     # Prepara contesto e prompt
     context = "\n".join([doc.page_content for doc in relevant_docs])
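The loop above emits one citation line per retrieved chunk, using the same "Chunk N - Title (file)" convention as the chunks viewer so a cited source can be matched back to a dropdown entry. A standalone sketch of that formatting; relevant_metas is a hypothetical stand-in for the doc.metadata dictionaries of the retrieved documents.

# Sketch of the source-citation formatting added to answer_question.
relevant_metas = [
    {"title": "Promessi Sposi", "author": "Manzoni", "filename": "promessi.pdf", "chunk_id": 12},
    {"title": "Divina Commedia", "author": "Unknown", "filename": "commedia.pdf", "chunk_id": 3},
]

sources = []
for meta in relevant_metas:
    chunk_info = (
        f"📚 Chunk {meta.get('chunk_id', 0)} - "
        f"{meta.get('title', 'Unknown')} ({meta.get('filename', 'Unknown')})"
    )
    if meta.get("author", "Unknown") != "Unknown":
        chunk_info += f" - Author: {meta['author']}"   # author appended only when known
    sources.append(chunk_info)

print("\n".join(sources))
# 📚 Chunk 12 - Promessi Sposi (promessi.pdf) - Author: Manzoni
# 📚 Chunk 3 - Divina Commedia (commedia.pdf)
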
db/.DS_Store
CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ
ui/chunks_viewer_tab.py
CHANGED
@@ -18,14 +18,26 @@ def create_chunks_viewer_tab():
 
     try:
         metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
+        vectorstore_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+
+        # Carica metadati e vectorstore
         with open(metadata_path, 'r') as f:
             metadata = json.load(f)
 
-
+        embeddings = get_embeddings()
+        vectorstore = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
+
+        # Crea lista di chunks con formato "Chunk X - Titolo (File)"
         chunk_list = []
+        current_index = 0
+
         for doc in metadata:
             for i in range(doc['chunks']):
-
+                # Recupera il contenuto del chunk per verifica
+                doc_id = list(vectorstore.docstore._dict.keys())[current_index]
+                chunk_metadata = vectorstore.docstore._dict[doc_id].metadata
+                chunk_list.append(f"Chunk {current_index} - {doc['title']} ({doc['filename']})")
+                current_index += 1
 
         return gr.Dropdown(choices=chunk_list, interactive=True), ""
     except Exception as e:
@@ -42,14 +54,11 @@ def create_chunks_viewer_tab():
         embeddings = get_embeddings()
         vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
 
-        # Estrai il numero del chunk
-        chunk_num = int(chunk_id.split(" - ")[0].replace("Chunk ", ""))
+        # Estrai il numero del chunk
+        chunk_num = int(chunk_id.split(" - ")[0].replace("Chunk ", ""))
 
-        #
+        # Recupera il chunk usando l'ID univoco
         doc_ids = list(vectorstore.docstore._dict.keys())
-        if chunk_num >= len(doc_ids):
-            return f"Errore: chunk {chunk_num + 1} non trovato nel database"
-
         chunk_content = vectorstore.docstore._dict[doc_ids[chunk_num]].page_content
         return chunk_content
 
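
The second hunk recovers a chunk's docstore position by parsing the dropdown label, so the round trip only works as long as labels keep the "Chunk {n} - ..." prefix written when the list is built. A short sketch of that label-to-index round trip; chunk_list and doc_store are hypothetical stand-ins for the dropdown choices and vectorstore.docstore._dict.

# Sketch: dropdown label -> docstore index, as in the chunks viewer.
chunk_list = [
    "Chunk 0 - Promessi Sposi (promessi.pdf)",
    "Chunk 1 - Promessi Sposi (promessi.pdf)",
    "Chunk 2 - Divina Commedia (commedia.pdf)",
]
doc_store = {f"id-{i}": f"text of chunk {i}" for i in range(3)}  # ordered like the real docstore

selected = chunk_list[2]
chunk_num = int(selected.split(" - ")[0].replace("Chunk ", ""))  # -> 2
doc_ids = list(doc_store.keys())
print(doc_store[doc_ids[chunk_num]])  # text of chunk 2

Note that the bounds check removed in this commit (chunk_num >= len(doc_ids)) is gone, so a stale label would now raise an IndexError instead of returning an error message.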