Nugh75 committed
Commit 07697cb · 1 Parent(s): 9804548

Sources fixed, at least I hope

app/document_handling.py CHANGED
@@ -41,13 +41,22 @@ def upload_and_index(files, title, author, db_name="default_db"):
 
     embeddings = get_embeddings()
     existing_vectorstore = None
+    current_chunk_offset = 0
 
     try:
+        # Calcola l'ultimo ID chunk utilizzato
+        last_chunk_id = 0
+        if os.path.exists(os.path.join(db_path, "metadata.json")):
+            with open(os.path.join(db_path, "metadata.json"), 'r') as f:
+                existing_metadata = json.load(f)
+                last_chunk_id = sum(doc['chunks'] for doc in existing_metadata)
+
         if os.path.exists(os.path.join(db_path, "index.faiss")):
             existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
     except Exception as e:
         logging.error(f"Errore caricamento vectorstore esistente: {e}")
         existing_vectorstore = None
+        last_chunk_id = 0
 
     # Processa i nuovi file
     for file in files:
@@ -62,11 +71,6 @@ def upload_and_index(files, title, author, db_name="default_db"):
 
             chunks = create_chunks(text)
 
-            # Calcola l'offset per i nuovi chunks
-            chunk_offset = 0
-            if existing_vectorstore:
-                chunk_offset = len(existing_vectorstore.docstore._dict)
-
             doc_meta = DocumentMetadata(
                 filename=os.path.basename(file.name),
                 title=title,
@@ -76,18 +80,23 @@ def upload_and_index(files, title, author, db_name="default_db"):
             )
             doc_metadata.append(doc_meta)
 
+            # Aggiungi metadati a ogni chunk
             for i, chunk in enumerate(chunks):
+                chunk_id = last_chunk_id + i
                 chunk_metadata = {
                     "content": chunk,
                     "source": os.path.basename(file.name),
                     "title": title,
                     "author": author,
-                    "chunk_index": chunk_offset + i,
-                    "total_chunks": len(chunks),
-                    "upload_date": doc_meta.upload_date
+                    "chunk_id": chunk_id,  # ID univoco del chunk
+                    "doc_chunk_index": i,  # Indice del chunk nel documento
+                    "total_doc_chunks": len(chunks),
+                    "filename": os.path.basename(file.name)  # Aggiunto per riferimento
                 }
                 documents.append(chunk_metadata)
 
+            last_chunk_id += len(chunks)
+
         except Exception as e:
             logging.error(f"Errore durante la lettura del file {file.name}: {e}")
             continue
@@ -105,11 +114,9 @@ def upload_and_index(files, title, author, db_name="default_db"):
 
     vectorstore.save_local(db_path)
 
+    # Aggiorna metadata.json
     final_metadata = merge_metadata([], doc_metadata, db_name)
-
-    # Salva i metadati
-    metadata_path = os.path.join(db_path, "metadata.json")
-    with open(metadata_path, 'w') as f:
+    with open(os.path.join(db_path, "metadata.json"), 'w') as f:
         json.dump(final_metadata, f, indent=2)
 
     return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
@@ -151,32 +158,62 @@ def list_indexed_files(db_name="default_db"):
         return f"Errore nella lettura dei metadati: {e}"
 
 def delete_file_from_database(file_name, db_name="default_db"):
-    """
-    Esempio semplificato: potresti voler rimuovere i chunk
-    da FAISS. Attualmente, la funzione gestisce un 'file_list.txt',
-    ma devi adattarla alle tue esigenze di rimozione dei chunk.
-    """
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
-    file_list_path = os.path.join(db_path, "file_list.txt")
+    """Elimina un file e i suoi chunks dal database."""
+    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+    metadata_path = os.path.join(db_path, "metadata.json")
 
-    if not os.path.exists(file_list_path):
-        return "Database non trovato (file_list.txt mancante)."
+    if not os.path.exists(metadata_path):
+        return "Database non trovato (metadata.json mancante)."
 
     try:
-        # Leggi la lista dei file
-        with open(file_list_path, "r") as f:
-            files = f.readlines()
+        # Carica i metadati esistenti
+        with open(metadata_path, 'r') as f:
+            metadata = json.load(f)
+
+        # Trova il file da eliminare
+        file_index = next((i for i, doc in enumerate(metadata)
+                           if doc['filename'] == file_name), -1)
+
+        if file_index == -1:
+            return f"File '{file_name}' non trovato nel database."
+
+        # Carica il vectorstore esistente
+        embeddings = get_embeddings()
+        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
 
-        # Rimuovi il file dalla lista
-        files = [line.strip() for line in files if line.strip() != file_name]
+        # Calcola l'intervallo di chunks da rimuovere
+        chunks_before = sum(doc['chunks'] for doc in metadata[:file_index])
+        chunks_to_remove = metadata[file_index]['chunks']
 
-        # Riscrivi la lista aggiornata
-        with open(file_list_path, "w") as f:
-            for fl in files:
-                f.write(f"{fl}\n")
+        # Estrai tutti i documenti tranne quelli da rimuovere
+        all_docs = list(vectorstore.docstore._dict.items())
+        docs_to_keep = (
+            all_docs[:chunks_before] +
+            all_docs[chunks_before + chunks_to_remove:]
+        )
+
+        # Rimuovi il file dai metadati
+        metadata.pop(file_index)
+
+        # Ricrea il vectorstore da zero
+        if docs_to_keep:
+            texts = [doc[1].page_content for doc in docs_to_keep]
+            metadatas = [doc[1].metadata for doc in docs_to_keep]
+            new_vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
+            new_vectorstore.save_local(db_path)
+        else:
+            # Se non ci sono più documenti, rimuovi il vectorstore
+            os.remove(os.path.join(db_path, "index.faiss"))
+            os.remove(os.path.join(db_path, "index.pkl"))
+
+        # Salva i metadati aggiornati
+        with open(metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=2)
+
+        return f"File '{file_name}' eliminato con successo."
 
-        return f"File '{file_name}' rimosso dal database '{db_name}'."
     except Exception as e:
-        return f"Errore durante la rimozione del file: {e}"
+        logging.error(f"Errore durante l'eliminazione: {e}")
+        return f"Errore durante l'eliminazione: {e}"
 
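Both the new indexing path and the new deletion path assume that chunks sit in the FAISS docstore in the same order as the documents listed in metadata.json. A minimal sketch of that bookkeeping, using plain Python lists in place of the real docstore (the file names and chunk counts below are made up for illustration, not taken from the repo):

# Stand-ins for metadata.json entries; their order must match docstore insertion order.
metadata = [
    {"filename": "a.pdf", "title": "Doc A", "chunks": 3},
    {"filename": "b.pdf", "title": "Doc B", "chunks": 2},
]

# New uploads continue the global numbering, mirroring
# last_chunk_id = sum(doc['chunks'] for doc in existing_metadata).
last_chunk_id = sum(doc["chunks"] for doc in metadata)
print(last_chunk_id)  # 5 -> the next uploaded file starts at chunk_id 5

# Deletion drops a contiguous slice of docstore entries, mirroring the
# chunks_before / chunks_to_remove arithmetic in delete_file_from_database.
all_docs = [f"chunk-{i}" for i in range(last_chunk_id)]  # stand-in for docstore items
file_index = 0  # delete "a.pdf"
chunks_before = sum(doc["chunks"] for doc in metadata[:file_index])
chunks_to_remove = metadata[file_index]["chunks"]
docs_to_keep = all_docs[:chunks_before] + all_docs[chunks_before + chunks_to_remove:]
print(docs_to_keep)  # ['chunk-3', 'chunk-4'] -> only Doc B's chunks remain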
app/llm_handling.py CHANGED
@@ -112,9 +112,17 @@ def answer_question(question, db_name, prompt_type="tutor", chat_history=None, l
         for doc in relevant_docs:
             meta = doc.metadata
             title = meta.get('title', 'Unknown')
-            chunk_index = meta.get('chunk_index', 0)
+            author = meta.get('author', 'Unknown')
+            filename = meta.get('filename', 'Unknown')
+            chunk_id = meta.get('chunk_id', 0)  # Usa l'ID univoco del chunk
             total_doc_chunks = total_chunks.get(title, 0)
-            sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Chunk {chunk_index+1} di {total_doc_chunks}")
+
+            # Usa lo stesso formato di chunks_viewer_tab.py
+            chunk_info = f"📚 Chunk {chunk_id} - {title} ({filename})"
+            if author != 'Unknown':
+                chunk_info += f" - Author: {author}"
+
+            sources.append(chunk_info)
 
         # Prepara contesto e prompt
         context = "\n".join([doc.page_content for doc in relevant_docs])
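The source labels are now built from the per-chunk metadata written at indexing time, so the answer panel and the chunks viewer refer to the same chunk IDs. A small sketch of that formatting logic, with a hypothetical dictionary standing in for doc.metadata (the helper name and sample values are illustrative only):

def format_source(meta: dict) -> str:
    """Builds a source label in the same 'Chunk {id} - Title (file)' shape used by the viewer."""
    title = meta.get("title", "Unknown")
    author = meta.get("author", "Unknown")
    filename = meta.get("filename", "Unknown")
    chunk_id = meta.get("chunk_id", 0)
    info = f"📚 Chunk {chunk_id} - {title} ({filename})"
    if author != "Unknown":
        info += f" - Author: {author}"
    return info

# Hypothetical chunk metadata, shaped like the entries created in upload_and_index.
print(format_source({"title": "Doc A", "author": "Rossi", "filename": "a.pdf", "chunk_id": 4}))
# 📚 Chunk 4 - Doc A (a.pdf) - Author: Rossi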
db/.DS_Store CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ
 
ui/chunks_viewer_tab.py CHANGED
@@ -18,14 +18,26 @@ def create_chunks_viewer_tab():
 
         try:
             metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
+            vectorstore_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+
+            # Carica metadati e vectorstore
             with open(metadata_path, 'r') as f:
                 metadata = json.load(f)
 
-            # Crea lista di chunks con formato "Chunk X - Titolo"
+            embeddings = get_embeddings()
+            vectorstore = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
+
+            # Crea lista di chunks con formato "Chunk X - Titolo (File)"
             chunk_list = []
+            current_index = 0
+
             for doc in metadata:
                 for i in range(doc['chunks']):
-                    chunk_list.append(f"Chunk {i+1} - {doc['title']}")
+                    # Recupera il contenuto del chunk per verifica
+                    doc_id = list(vectorstore.docstore._dict.keys())[current_index]
+                    chunk_metadata = vectorstore.docstore._dict[doc_id].metadata
+                    chunk_list.append(f"Chunk {current_index} - {doc['title']} ({doc['filename']})")
+                    current_index += 1
 
             return gr.Dropdown(choices=chunk_list, interactive=True), ""
         except Exception as e:
@@ -42,14 +54,11 @@ def create_chunks_viewer_tab():
             embeddings = get_embeddings()
             vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
 
-            # Estrai il numero del chunk dal formato "Chunk X - Titolo"
-            chunk_num = int(chunk_id.split(" - ")[0].replace("Chunk ", "")) - 1
+            # Estrai il numero del chunk
+            chunk_num = int(chunk_id.split(" - ")[0].replace("Chunk ", ""))
 
-            # Verifica che l'indice sia valido
+            # Recupera il chunk usando l'ID univoco
             doc_ids = list(vectorstore.docstore._dict.keys())
-            if chunk_num >= len(doc_ids):
-                return f"Errore: chunk {chunk_num + 1} non trovato nel database"
-
             chunk_content = vectorstore.docstore._dict[doc_ids[chunk_num]].page_content
             return chunk_content
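The viewer still converts the dropdown label back into a positional index into the docstore, so the lookup is only correct while chunk numbering matches insertion order. A rough sketch of that round trip, with a simulated docstore (labels, IDs, and contents below are invented for illustration):

labels = [f"Chunk {i} - Doc A (a.pdf)" for i in range(3)]          # what the dropdown shows
docstore = {f"id-{i}": f"content of chunk {i}" for i in range(3)}  # stand-in for vectorstore.docstore._dict

selected = labels[2]
chunk_num = int(selected.split(" - ")[0].replace("Chunk ", ""))    # same parsing as the viewer
doc_ids = list(docstore.keys())
print(docstore[doc_ids[chunk_num]])  # content of chunk 2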