Spaces:

Nugh75
/

Edurag_beta

Sleeping

App Files Files Community

Edurag_beta / app /document_handling.py

Nugh75

moduliazione del programma

080146c 7 months ago

raw

history blame

6.84 kB

	import logging
	import gradio as gr
	from langchain_community.vectorstores import FAISS
	import os
	import PyPDF2
	from docx import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import json
	from datetime import datetime
	from app.functions.database_handling import BASE_DB_PATH
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
	from app.utils.embedding_utils import get_embeddings
	from app.utils.dataclass_utils import DocumentMetadata, save_metadata


	# -------------- UTILITY FUNCTIONS --------------

	def extract_text_from_pdf(file_path):
	    """
	    Extract the text of every page of a PDF file.

	    Args:
	        file_path: Path of the PDF file.

	    Returns:
	        str: The text of all pages concatenated in page order.
	    """
	    with open(file_path, 'rb') as f:
	        reader = PyPDF2.PdfReader(f)
	        # A page with no extractable text may yield None or "" depending on
	        # the PyPDF2 version; coerce to "" so concatenation never fails.
	        # The join runs inside the ``with`` because pages are read from the
	        # still-open file handle.
	        return "".join(page.extract_text() or "" for page in reader.pages)

	def extract_text_from_docx(file_path):
	    """
	    Read a DOCX file and return the text of its paragraphs.

	    Args:
	        file_path: Path of the DOCX file.

	    Returns:
	        str: Every paragraph's text, each one followed by a newline.
	    """
	    paragraphs = Document(file_path).paragraphs
	    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)

	def create_chunks(text):
	    """
	    Split text into chunks using the configured chunk size and overlap.

	    Args:
	        text: The text to split.

	    Returns:
	        The pieces produced by ``RecursiveCharacterTextSplitter.split_text``.
	    """
	    from app.config import EMBEDDING_CONFIG

	    splitter_options = {
	        "chunk_size": EMBEDDING_CONFIG["chunk_size"],
	        "chunk_overlap": EMBEDDING_CONFIG["chunk_overlap"],
	        "length_function": len,
	        # Separators are tried in this order: paragraph, line, word, character.
	        "separators": ["\n\n", "\n", " ", ""],
	    }
	    splitter = RecursiveCharacterTextSplitter(**splitter_options)
	    return splitter.split_text(text)


	def create_vectorstore(texts, metadatas, db_path):
	    """
	    Build a FAISS index from the given texts and save it under ``db_path``.

	    Args:
	        texts: The text chunks to embed and index.
	        metadatas: One metadata dict per entry of ``texts``.
	        db_path: Directory in which the index is saved.

	    Returns:
	        The FAISS vector store that was built and saved.
	    """
	    embeddings = get_embeddings()
	    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
	    # Previously the index was built and then dropped: ``db_path`` was never
	    # used and nothing was returned. Persist it and hand it back.
	    os.makedirs(db_path, exist_ok=True)
	    db.save_local(db_path)
	    return db




	# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

	def _read_document_text(path):
	    """Return the text of the file at ``path``, picking the reader by extension."""
	    # Compare case-insensitively so e.g. "REPORT.PDF" is not read as plain text.
	    lowered = path.lower()
	    if lowered.endswith('.pdf'):
	        return extract_text_from_pdf(path)
	    if lowered.endswith('.docx'):
	        return extract_text_from_docx(path)
	    # Anything else (.txt and similar) is read as UTF-8 plain text.
	    with open(path, 'r', encoding='utf-8') as f:
	        return f.read()


	def upload_and_index(files, title, author, db_name="default_db"):
	    """
	    Read the uploaded files, split them into chunks and index them in FAISS.

	    Files that cannot be read are logged and skipped. If at least one chunk
	    was produced, a FAISS index is built from all chunks, saved under
	    ``BASE_DB_PATH/faiss_index_<db_name>``, and the per-document metadata is
	    stored through ``save_metadata``.

	    Args:
	        files: Uploaded files; each item is either an object exposing the
	            file path as ``.name`` or a plain path string.
	        title: Title recorded for every document and chunk.
	        author: Author recorded for every document and chunk.
	        db_name: Name of the target database.

	    Returns:
	        str: A status message describing the outcome.
	    """
	    if not files:
	        return "Nessun file caricato."

	    documents = []
	    doc_metadata = []

	    for file in files:
	        # Accept both objects with a ``.name`` path and plain path strings.
	        path = getattr(file, "name", file)
	        try:
	            text = _read_document_text(path)
	            chunks = create_chunks(text)
	            source = os.path.basename(path)

	            # Document-level metadata.
	            doc_meta = DocumentMetadata(
	                filename=source,
	                title=title,
	                author=author,
	                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	                chunks=len(chunks)
	            )

	            # Per-chunk metadata; "content" carries the chunk text itself.
	            documents.extend(
	                {
	                    "content": chunk,
	                    "source": source,
	                    "title": title,
	                    "author": author,
	                    "chunk_index": i,
	                    "total_chunks": len(chunks),
	                    "upload_date": doc_meta.upload_date,
	                }
	                for i, chunk in enumerate(chunks)
	            )
	            doc_metadata.append(doc_meta)

	        except Exception as e:
	            # Best effort: one unreadable file must not abort the whole upload.
	            logging.error(f"Errore durante la lettura del file {path}: {e}")
	            continue

	    if documents:
	        try:
	            db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
	            os.makedirs(db_path, exist_ok=True)

	            # Use the centralized embeddings factory rather than building one here.
	            embeddings = get_embeddings()
	            texts = [doc["content"] for doc in documents]
	            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

	            # Create (or overwrite) the FAISS index with these documents.
	            vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
	            vectorstore.save_local(db_path)

	            # Persist the document-level metadata.
	            save_metadata(doc_metadata, db_name)

	            return f"Documenti indicizzati con successo nel database '{db_name}'!"
	        except Exception as e:
	            logging.error(f"Errore durante l'indicizzazione: {e}")
	            return f"Errore durante l'indicizzazione: {e}"

	    return "Nessun documento processato."

	def list_indexed_files(db_name="default_db"):
	    """
	    Build a human-readable listing of the documents recorded for a database.

	    The listing is read from ``metadata.json`` inside
	    ``BASE_DB_PATH/faiss_index_<db_name>``.

	    Args:
	        db_name: Name of the database to inspect.

	    Returns:
	        str: One formatted entry per document, or a status/error message when
	        the metadata file is missing, empty or unreadable.
	    """
	    metadata_file = os.path.join(
	        BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json"
	    )

	    if not os.path.exists(metadata_file):
	        return "Nessun file nel database."

	    try:
	        with open(metadata_file, 'r') as handle:
	            entries = json.load(handle)

	        if not entries:
	            return "Nessun documento nel database."

	        # One multi-line text block per recorded document.
	        blocks = [
	            f"📄 {entry['title']}\n"
	            f" Autore: {entry['author']}\n"
	            f" File: {entry['filename']}\n"
	            f" Chunks: {entry['chunks']}\n"
	            f" Caricato il: {entry['upload_date']}\n"
	            for entry in entries
	        ]

	        if not blocks:
	            return "Nessun documento nel database."
	        return "\n".join(blocks)
	    except Exception as e:
	        logging.error(f"Errore nella lettura dei metadati: {e}")
	        return f"Errore nella lettura dei metadati: {e}"

	def delete_file_from_database(file_name, db_name="default_db"):
	    """
	    Remove a file name from the database's ``file_list.txt``.

	    Simplified example: only the ``file_list.txt`` listing is updated. The
	    chunks stored in the FAISS index are NOT removed; adapt this function if
	    chunk removal is required.

	    Args:
	        file_name: Name of the file to remove from the listing.
	        db_name: Name of the database.

	    Returns:
	        str: A status message describing the outcome.
	    """
	    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
	    file_list_path = os.path.join(db_path, "file_list.txt")

	    if not os.path.exists(file_list_path):
	        return "Database non trovato (file_list.txt mancante)."

	    try:
	        # Read the current listing, one file name per line.
	        with open(file_list_path, "r") as f:
	            entries = [line.strip() for line in f]

	        remaining = [entry for entry in entries if entry != file_name]

	        # Do not claim a removal (or rewrite the list) when nothing matched.
	        if len(remaining) == len(entries):
	            return f"File '{file_name}' non trovato nel database '{db_name}'."

	        # Write the updated listing back.
	        with open(file_list_path, "w") as f:
	            f.writelines(f"{entry}\n" for entry in remaining)

	        return f"File '{file_name}' rimosso dal database '{db_name}'."
	    except Exception as e:
	        # Log failures the same way the other functions in this module do.
	        logging.error(f"Errore durante la rimozione del file: {e}")
	        return f"Errore durante la rimozione del file: {e}"