Edurag_beta / app/document_handling.py
import json
import logging
import os
from datetime import datetime

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from app.utils.database_handling import BASE_DB_PATH
from app.utils.dataclass_utils import DocumentMetadata, save_metadata
from app.utils.embedding_utils import *  # provides get_embeddings() and create_chunks() used below
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge the existing metadata with the new entries.

    Note: the ``existing_metadata`` argument is ignored; the current entries are
    always reloaded from the database's metadata.json (or an empty list is used
    if the file does not exist yet).
    """
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            existing_metadata = json.load(f)
    else:
        existing_metadata = []
    # Convert the new metadata objects to plain dictionaries
    new_metadata_dicts = [meta.to_dict() if hasattr(meta, 'to_dict') else meta for meta in new_metadata]
    existing_metadata.extend(new_metadata_dicts)
    return existing_metadata
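
# Illustrative layout of metadata.json (field names come from DocumentMetadata as
# used below; the values are made-up examples, not real data):
# [
#   {
#     "filename": "esempio.pdf",
#     "title": "Documento di esempio",
#     "author": "Autore di esempio",
#     "upload_date": "2024-01-01 10:00:00",
#     "chunks": 12
#   }
# ]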

def upload_and_index(files, title, author, db_name="default_db"):
    """Index the uploaded files into the FAISS database and update metadata.json."""
    if not files:
        return False, "Nessun file caricato.", ""

    documents = []
    doc_metadata = []

    # Create the database directory if it does not exist
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    os.makedirs(db_path, exist_ok=True)

    embeddings = get_embeddings()
    existing_vectorstore = None

    try:
        # Work out the last chunk ID already used in this database
        last_chunk_id = 0
        if os.path.exists(os.path.join(db_path, "metadata.json")):
            with open(os.path.join(db_path, "metadata.json"), 'r') as f:
                existing_metadata = json.load(f)
            last_chunk_id = sum(doc['chunks'] for doc in existing_metadata)
        if os.path.exists(os.path.join(db_path, "index.faiss")):
            existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.error(f"Errore caricamento vectorstore esistente: {e}")
        existing_vectorstore = None
        last_chunk_id = 0
    # Process the new files
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = create_chunks(text)

            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            doc_metadata.append(doc_meta)

            # Attach metadata to every chunk
            for i, chunk in enumerate(chunks):
                chunk_id = last_chunk_id + i
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_id": chunk_id,            # database-wide unique chunk ID
                    "doc_chunk_index": i,            # index of the chunk within its document
                    "total_doc_chunks": len(chunks),
                    "filename": os.path.basename(file.name)  # kept for easy reference
                }
                documents.append(chunk_metadata)

            last_chunk_id += len(chunks)
        except Exception as e:
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue
    if documents:
        try:
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            if existing_vectorstore:
                existing_vectorstore.add_texts(texts, metadatas=metadatas)
                vectorstore = existing_vectorstore
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

            vectorstore.save_local(db_path)

            # Update metadata.json (merge_metadata reloads the existing entries from disk)
            final_metadata = merge_metadata([], doc_metadata, db_name)
            with open(os.path.join(db_path, "metadata.json"), 'w') as f:
                json.dump(final_metadata, f, indent=2)

            return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
        except Exception as e:
            error_msg = f"Errore durante l'indicizzazione: {e}"
            logging.error(error_msg)
            return False, error_msg, ""

    return False, "Nessun documento processato.", ""
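
# Note: delete_file_from_database() below drops chunks by their position in the
# FAISS docstore, so it relies on upload_and_index() having appended the chunks
# of each document contiguously and in the same order recorded in metadata.json.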

def list_indexed_files(db_name="default_db"):
    """Return a human-readable summary of the documents indexed in the database."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "Nessun file nel database."

    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        if not metadata:
            return "Nessun documento nel database."

        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Autore: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Caricato il: {doc['upload_date']}\n"
            )

        return "\n".join(output) if output else "Nessun documento nel database."
    except Exception as e:
        logging.error(f"Errore nella lettura dei metadati: {e}")
        return f"Errore nella lettura dei metadati: {e}"

def delete_file_from_database(file_name, db_name="default_db"):
    """Delete a file and its chunks from the database."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_path = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_path):
        return "Database non trovato (metadata.json mancante)."

    try:
        # Load the existing metadata
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)

        # Find the file to delete
        file_index = next((i for i, doc in enumerate(metadata)
                           if doc['filename'] == file_name), -1)
        if file_index == -1:
            return f"File '{file_name}' non trovato nel database."

        # Load the existing vectorstore
        embeddings = get_embeddings()
        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

        # Work out the range of chunks to remove
        chunks_before = sum(doc['chunks'] for doc in metadata[:file_index])
        chunks_to_remove = metadata[file_index]['chunks']

        # Keep every stored document except the ones being removed
        all_docs = list(vectorstore.docstore._dict.items())
        docs_to_keep = (
            all_docs[:chunks_before] +
            all_docs[chunks_before + chunks_to_remove:]
        )

        # Remove the file from the metadata
        metadata.pop(file_index)

        # Rebuild the vectorstore from scratch
        if docs_to_keep:
            texts = [doc[1].page_content for doc in docs_to_keep]
            metadatas = [doc[1].metadata for doc in docs_to_keep]
            new_vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            new_vectorstore.save_local(db_path)
        else:
            # If no documents are left, remove the vectorstore files
            os.remove(os.path.join(db_path, "index.faiss"))
            os.remove(os.path.join(db_path, "index.pkl"))

        # Save the updated metadata
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        return f"File '{file_name}' eliminato con successo."
    except Exception as e:
        logging.error(f"Errore durante l'eliminazione: {e}")
        return f"Errore durante l'eliminazione: {e}"
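
# ---------------------------------------------------------------------------
# Minimal usage sketch (assumption: not part of the original module).
# It only illustrates how the three helpers above fit together; the file path,
# title, author and database name are made-up examples, and SimpleNamespace
# stands in for an uploaded-file object (anything exposing a ``.name``
# attribute that points to a readable path works).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    uploaded = [SimpleNamespace(name="docs/esempio.pdf")]  # hypothetical path

    ok, message, status = upload_and_index(
        uploaded, title="Documento di esempio", author="Autore di esempio", db_name="default_db"
    )
    print(ok, message, status)

    # List what is currently indexed, then remove the file again.
    print(list_indexed_files("default_db"))
    print(delete_file_from_database("esempio.pdf", db_name="default_db"))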