import logging
import os
import json
from datetime import datetime

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from app.utils.database_handling import BASE_DB_PATH
from app.utils.embedding_utils import *
from app.utils.dataclass_utils import DocumentMetadata, save_metadata
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge the existing metadata with the new entries."""
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            existing_metadata = json.load(f)
    else:
        # Fall back to whatever the caller passed in when no metadata file exists yet.
        existing_metadata = existing_metadata or []

    # Convert the new metadata objects to plain dictionaries
    new_metadata_dicts = [meta.to_dict() if hasattr(meta, 'to_dict') else meta
                          for meta in new_metadata]
    existing_metadata.extend(new_metadata_dicts)
    return existing_metadata


def upload_and_index(files, title, author, db_name="default_db"):
    if not files:
        return False, "No files uploaded.", ""

    documents = []
    doc_metadata = []

    # Create the database directory if it does not exist
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    os.makedirs(db_path, exist_ok=True)

    embeddings = get_embeddings()
    existing_vectorstore = None

    try:
        if os.path.exists(os.path.join(db_path, "index.faiss")):
            existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.error(f"Error loading existing vectorstore: {e}")
        existing_vectorstore = None

    # Process the new files
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = create_chunks(text)

            # Compute the chunk-index offset so new chunks continue
            # after the ones already stored in the vectorstore
            chunk_offset = 0
            if existing_vectorstore:
                chunk_offset = len(existing_vectorstore.docstore._dict)

            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            doc_metadata.append(doc_meta)

            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": chunk_offset + i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)
        except Exception as e:
            logging.error(f"Error reading file {file.name}: {e}")
            continue

    if documents:
        try:
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            if existing_vectorstore:
                existing_vectorstore.add_texts(texts, metadatas=metadatas)
                vectorstore = existing_vectorstore
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

            vectorstore.save_local(db_path)

            final_metadata = merge_metadata([], doc_metadata, db_name)

            # Save the metadata
            metadata_path = os.path.join(db_path, "metadata.json")
            with open(metadata_path, 'w') as f:
                json.dump(final_metadata, f, indent=2)

            return True, "Documents indexed successfully!", f"Database '{db_name}' updated"
        except Exception as e:
            error_msg = f"Error during indexing: {e}"
            logging.error(error_msg)
            return False, error_msg, ""

    return False, "No documents processed.", ""
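
# A minimal usage sketch for `upload_and_index`. It assumes the elements of
# `files` only need a `.name` path attribute (which is what Gradio's File
# component provides); `index_local_file` below is a hypothetical convenience
# wrapper added for illustration, not part of the original app.
def index_local_file(path, title, author, db_name="default_db"):
    """Wrap a filesystem path in a minimal `.name`-bearing object and index it."""
    from types import SimpleNamespace
    return upload_and_index([SimpleNamespace(name=path)], title, author, db_name)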
def list_indexed_files(db_name="default_db"):
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "No files in the database."

    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        if not metadata:
            return "No documents in the database."

        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Author: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Uploaded on: {doc['upload_date']}\n"
            )
        return "\n".join(output) if output else "No documents in the database."
    except Exception as e:
        logging.error(f"Error reading metadata: {e}")
        return f"Error reading metadata: {e}"


def delete_file_from_database(file_name, db_name="default_db"):
    """
    Simplified example: you may want to remove the chunks from FAISS as well.
    Currently the function only maintains a 'file_list.txt'; adapt it to your
    chunk-removal needs.
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    file_list_path = os.path.join(db_path, "file_list.txt")

    if not os.path.exists(file_list_path):
        return "Database not found (file_list.txt missing)."

    try:
        # Read the file list
        with open(file_list_path, "r") as f:
            files = f.readlines()

        # Remove the file from the list
        files = [line.strip() for line in files if line.strip() != file_name]

        # Rewrite the updated list
        with open(file_list_path, "w") as f:
            for fl in files:
                f.write(f"{fl}\n")

        return f"File '{file_name}' removed from database '{db_name}'."
    except Exception as e:
        return f"Error while removing the file: {e}"
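
# The docstring of `delete_file_from_database` notes that the chunks should
# really be removed from the FAISS index too. Below is a minimal sketch of
# that, assuming the chunk metadata written by `upload_and_index` (its
# "source" field) and LangChain's `FAISS.delete(ids=...)` API. Note that
# `docstore._dict` is a private attribute of the LangChain FAISS wrapper, so
# treat this as an illustrative sketch rather than a definitive implementation.
def delete_chunks_from_vectorstore(file_name, db_name="default_db"):
    """Remove every chunk whose 'source' metadata matches `file_name`."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    if not os.path.exists(os.path.join(db_path, "index.faiss")):
        return f"Database '{db_name}' not found."

    embeddings = get_embeddings()
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    # Collect the docstore ids of every chunk that came from `file_name`.
    ids_to_delete = [
        doc_id
        for doc_id, doc in vectorstore.docstore._dict.items()
        if doc.metadata.get("source") == file_name
    ]
    if not ids_to_delete:
        return f"No chunks found for '{file_name}'."

    vectorstore.delete(ids=ids_to_delete)
    vectorstore.save_local(db_path)
    return f"Removed {len(ids_to_delete)} chunks for '{file_name}' from '{db_name}'."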