import logging
import os
import json
from datetime import datetime

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from app.utils.database_handling import BASE_DB_PATH
from app.utils.embedding_utils import *
from app.utils.dataclass_utils import DocumentMetadata, save_metadata
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge the new document metadata with the metadata already stored on disk.

    Note: the ``existing_metadata`` argument is ignored; the current list is
    always reloaded from the database's metadata.json (or initialised empty).
    """
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            existing_metadata = json.load(f)
    else:
        existing_metadata = []

    # Convert the new metadata objects into plain dictionaries
    new_metadata_dicts = [meta.to_dict() if hasattr(meta, 'to_dict') else meta for meta in new_metadata]
    existing_metadata.extend(new_metadata_dicts)
    return existing_metadata
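
# For reference, after indexing each entry of metadata.json has the shape below
# (the keys come from DocumentMetadata as used in upload_and_index; the values
# here are purely illustrative):
#
# {
#     "filename": "report.pdf",
#     "title": "Quarterly Report",
#     "author": "Jane Doe",
#     "upload_date": "2024-01-01 12:00:00",
#     "chunks": 12
# }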

def upload_and_index(files, title, author, db_name="default_db"):
    """Extract text from the uploaded files, split it into chunks and index them in the FAISS database."""
    if not files:
        return False, "Nessun file caricato.", ""

    documents = []
    doc_metadata = []

    # Create the database directory if it does not exist
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    os.makedirs(db_path, exist_ok=True)

    embeddings = get_embeddings()
    existing_vectorstore = None
    try:
        if os.path.exists(os.path.join(db_path, "index.faiss")):
            existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.error(f"Errore caricamento vectorstore esistente: {e}")
        existing_vectorstore = None

    # Process the new files
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = create_chunks(text)

            # Offset so that chunk_index stays unique across the existing
            # vectorstore and the files already processed in this call
            chunk_offset = len(documents)
            if existing_vectorstore:
                chunk_offset += len(existing_vectorstore.docstore._dict)

            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            doc_metadata.append(doc_meta)

            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": chunk_offset + i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)

        except Exception as e:
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue

    if documents:
        try:
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            if existing_vectorstore:
                existing_vectorstore.add_texts(texts, metadatas=metadatas)
                vectorstore = existing_vectorstore
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

            vectorstore.save_local(db_path)

            # Merge and persist the per-document metadata
            final_metadata = merge_metadata([], doc_metadata, db_name)
            metadata_path = os.path.join(db_path, "metadata.json")
            with open(metadata_path, 'w') as f:
                json.dump(final_metadata, f, indent=2)

            return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
        except Exception as e:
            error_msg = f"Errore durante l'indicizzazione: {e}"
            logging.error(error_msg)
            return False, error_msg, ""

    return False, "Nessun documento processato.", ""

def list_indexed_files(db_name="default_db"):
    """Return a human-readable listing of the documents indexed in the given database."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "Nessun file nel database."

    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        if not metadata:
            return "Nessun documento nel database."

        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Autore: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Caricato il: {doc['upload_date']}\n"
            )
        return "\n".join(output) if output else "Nessun documento nel database."
    except Exception as e:
        logging.error(f"Errore nella lettura dei metadati: {e}")
        return f"Errore nella lettura dei metadati: {e}"

def delete_file_from_database(file_name, db_name="default_db"):
    """
    Simplified example: you will probably also want to remove the file's chunks
    from the FAISS index itself (see the sketch after this function). As written,
    the function only maintains a 'file_list.txt'; adapt it to your chunk-removal needs.
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    file_list_path = os.path.join(db_path, "file_list.txt")

    if not os.path.exists(file_list_path):
        return "Database non trovato (file_list.txt mancante)."

    try:
        # Read the list of files
        with open(file_list_path, "r") as f:
            files = f.readlines()

        # Remove the file from the list
        files = [line.strip() for line in files if line.strip() != file_name]

        # Rewrite the updated list
        with open(file_list_path, "w") as f:
            for fl in files:
                f.write(f"{fl}\n")

        return f"File '{file_name}' rimosso dal database '{db_name}'."
    except Exception as e:
        return f"Errore durante la rimozione del file: {e}"