import json
import logging
import os
from datetime import datetime

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from app.utils.database_handling import BASE_DB_PATH
from app.utils.dataclass_utils import DocumentMetadata, save_metadata
from app.utils.embedding_utils import *
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------


def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge the metadata already stored on disk with the newly created entries."""
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            existing_metadata = json.load(f)
    else:
        existing_metadata = []

    # Convert the new metadata objects into plain dictionaries.
    new_metadata_dicts = [
        meta.to_dict() if hasattr(meta, 'to_dict') else meta for meta in new_metadata
    ]
    existing_metadata.extend(new_metadata_dicts)
    return existing_metadata


def upload_and_index(files, title, author, db_name="default_db"):
    """Index the uploaded files into the FAISS database and update metadata.json."""
    if not files:
        return False, "No file uploaded.", ""

    documents = []
    doc_metadata = []

    # Create the database directory if it does not exist yet.
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    os.makedirs(db_path, exist_ok=True)

    embeddings = get_embeddings()
    existing_vectorstore = None

    try:
        # Work out the last chunk ID already used in this database.
        last_chunk_id = 0
        if os.path.exists(os.path.join(db_path, "metadata.json")):
            with open(os.path.join(db_path, "metadata.json"), 'r') as f:
                existing_metadata = json.load(f)
                last_chunk_id = sum(doc['chunks'] for doc in existing_metadata)
        if os.path.exists(os.path.join(db_path, "index.faiss")):
            existing_vectorstore = FAISS.load_local(
                db_path, embeddings, allow_dangerous_deserialization=True
            )
    except Exception as e:
        logging.error(f"Error loading existing vectorstore: {e}")
        existing_vectorstore = None
        last_chunk_id = 0

    # Process the new files.
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = create_chunks(text)
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            doc_metadata.append(doc_meta)

            # Attach metadata to every chunk.
            for i, chunk in enumerate(chunks):
                chunk_id = last_chunk_id + i
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_id": chunk_id,          # Unique ID of the chunk
                    "doc_chunk_index": i,          # Index of the chunk within the document
                    "total_doc_chunks": len(chunks),
                    "filename": os.path.basename(file.name)  # Kept for reference
                }
                documents.append(chunk_metadata)

            last_chunk_id += len(chunks)
        except Exception as e:
            logging.error(f"Error reading file {file.name}: {e}")
            continue

    if documents:
        try:
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            if existing_vectorstore:
                existing_vectorstore.add_texts(texts, metadatas=metadatas)
                vectorstore = existing_vectorstore
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

            vectorstore.save_local(db_path)

            # Update metadata.json.
            final_metadata = merge_metadata([], doc_metadata, db_name)
            with open(os.path.join(db_path, "metadata.json"), 'w') as f:
                json.dump(final_metadata, f, indent=2)

            return True, "Documents indexed successfully!", f"Database '{db_name}' updated"
        except Exception as e:
            error_msg = f"Error during indexing: {e}"
            logging.error(error_msg)
            return False, error_msg, ""

    return False, "No document processed.", ""
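
# NOTE: `get_embeddings()` and `create_chunks()` come from the wildcard import of
# app.utils.embedding_utils above; their implementation is not part of this module.
# As an assumption only, they are expected to behave roughly like the sketch below
# (the model name and chunking parameters are illustrative guesses, not the
# project's actual configuration):
#
#     from langchain_text_splitters import RecursiveCharacterTextSplitter
#
#     def get_embeddings():
#         return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#
#     def create_chunks(text, chunk_size=1000, chunk_overlap=100):
#         splitter = RecursiveCharacterTextSplitter(
#             chunk_size=chunk_size, chunk_overlap=chunk_overlap
#         )
#         return splitter.split_text(text)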
"metadata.json"), 'w') as f: json.dump(final_metadata, f, indent=2) return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato" except Exception as e: error_msg = f"Errore durante l'indicizzazione: {e}" logging.error(error_msg) return False, error_msg, "" return False, "Nessun documento processato.", "" def list_indexed_files(db_name="default_db"): db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}") # Modifica qui metadata_file = os.path.join(db_path, "metadata.json") if not os.path.exists(metadata_file): return "Nessun file nel database." try: with open(metadata_file, 'r') as f: metadata = json.load(f) if not metadata: return "Nessun documento nel database." output = [] for doc in metadata: output.append( f"📄 {doc['title']}\n" f" Autore: {doc['author']}\n" f" File: {doc['filename']}\n" f" Chunks: {doc['chunks']}\n" f" Caricato il: {doc['upload_date']}\n" ) return "\n".join(output) if output else "Nessun documento nel database." except Exception as e: logging.error(f"Errore nella lettura dei metadati: {e}") return f"Errore nella lettura dei metadati: {e}" def delete_file_from_database(file_name, db_name="default_db"): """Elimina un file e i suoi chunks dal database.""" db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}") metadata_path = os.path.join(db_path, "metadata.json") if not os.path.exists(metadata_path): return "Database non trovato (metadata.json mancante)." try: # Carica i metadati esistenti with open(metadata_path, 'r') as f: metadata = json.load(f) # Trova il file da eliminare file_index = next((i for i, doc in enumerate(metadata) if doc['filename'] == file_name), -1) if file_index == -1: return f"File '{file_name}' non trovato nel database." # Carica il vectorstore esistente embeddings = get_embeddings() vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True) # Calcola l'intervallo di chunks da rimuovere chunks_before = sum(doc['chunks'] for doc in metadata[:file_index]) chunks_to_remove = metadata[file_index]['chunks'] # Estrai tutti i documenti tranne quelli da rimuovere all_docs = list(vectorstore.docstore._dict.items()) docs_to_keep = ( all_docs[:chunks_before] + all_docs[chunks_before + chunks_to_remove:] ) # Rimuovi il file dai metadati metadata.pop(file_index) # Ricrea il vectorstore da zero if docs_to_keep: texts = [doc[1].page_content for doc in docs_to_keep] metadatas = [doc[1].metadata for doc in docs_to_keep] new_vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas) new_vectorstore.save_local(db_path) else: # Se non ci sono più documenti, rimuovi il vectorstore os.remove(os.path.join(db_path, "index.faiss")) os.remove(os.path.join(db_path, "index.pkl")) # Salva i metadati aggiornati with open(metadata_path, 'w') as f: json.dump(metadata, f, indent=2) return f"File '{file_name}' eliminato con successo." except Exception as e: logging.error(f"Errore durante l'eliminazione: {e}") return f"Errore durante l'eliminazione: {e}"