# Edurag_beta/app/document_handling.py
import logging
import gradio as gr  # added missing import
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil
import PyPDF2
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dataclasses import dataclass
import json
from datetime import datetime
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
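
# A minimal sketch of how the splitter behaves (illustrative only; exact chunk
# boundaries depend on the input text and on the splitter's separators):
#
#   chunks = text_splitter.split_text(long_text)
#   # -> list of strings, each up to ~500 characters, with consecutive chunks
#   #    sharing up to 100 characters of overlap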
# -------------- UTILITY FUNCTIONS --------------
@dataclass
class DocumentMetadata:
filename: str
title: str
author: str
upload_date: str
chunks: int
def to_dict(self):
return {
"filename": self.filename,
"title": self.title,
"author": self.author,
"upload_date": self.upload_date,
"chunks": self.chunks
}
def save_metadata(metadata_list, db_name):
db_path = f"faiss_index_{db_name}"
metadata_file = os.path.join(db_path, "metadata.json")
existing_metadata = []
if os.path.exists(metadata_file):
with open(metadata_file, 'r') as f:
existing_metadata = json.load(f)
existing_metadata.extend([m.to_dict() for m in metadata_list])
with open(metadata_file, 'w') as f:
json.dump(existing_metadata, f, indent=2)
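
# For reference, metadata.json ends up holding a flat list of the dicts produced
# by DocumentMetadata.to_dict(), roughly:
#
#   [
#     {"filename": "report.pdf", "title": "Report", "author": "...",
#      "upload_date": "2024-01-01 12:00:00", "chunks": 12},
#     ...
#   ]
#
# (the values above are made-up examples; only the keys come from the code)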
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages with no text layer
            text += page.extract_text() or ""
    return text
def extract_text_from_docx(file_path):
doc = Document(file_path)
text = ""
for para in doc.paragraphs:
text += para.text + "\n"
return text
# -------------- CHATBOT TAB FUNCTIONS --------------
def answer_question(question, db_name="default_db"):
db_path = f"faiss_index_{db_name}"
if not os.path.exists(db_path):
logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
return "Database non trovato."
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
# Perform a similarity search
docs = vectorstore.similarity_search(question)
if not docs:
return "Nessun documento corrispondente alla query."
# Collect the document contents
results = [doc.page_content for doc in docs]
return "\n\n".join(results)
# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
def upload_and_index(files, title, author, db_name="default_db"):
if not files:
return "Nessun file caricato."
documents = []
doc_metadata = []
for file in files:
try:
if file.name.endswith('.pdf'):
text = extract_text_from_pdf(file.name)
elif file.name.endswith('.docx'):
text = extract_text_from_docx(file.name)
            else:
                # .txt file or other plain text
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()
chunks = text_splitter.split_text(text)
            # Document-level metadata
doc_meta = DocumentMetadata(
filename=os.path.basename(file.name),
title=title,
author=author,
upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
chunks=len(chunks)
)
            # Per-chunk metadata stored alongside each embedding
for i, chunk in enumerate(chunks):
chunk_metadata = {
"content": chunk,
"source": os.path.basename(file.name),
"title": title,
"author": author,
"chunk_index": i,
"total_chunks": len(chunks),
"upload_date": doc_meta.upload_date
}
documents.append(chunk_metadata)
doc_metadata.append(doc_meta)
except Exception as e:
logging.error(f"Errore durante la lettura del file {file.name}: {e}")
continue
if documents:
try:
db_path = f"faiss_index_{db_name}"
os.makedirs(db_path, exist_ok=True)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
texts = [doc["content"] for doc in documents]
metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
            # Merge into the existing FAISS index if one is already on disk,
            # otherwise create it; overwriting unconditionally would drop
            # previously indexed documents while metadata.json keeps accumulating them.
            if os.path.exists(os.path.join(db_path, "index.faiss")):
                vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
                vectorstore.add_texts(texts, metadatas=metadatas)
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            vectorstore.save_local(db_path)
            # Persist the document-level metadata to metadata.json
save_metadata(doc_metadata, db_name)
return f"Documenti indicizzati con successo nel database '{db_name}'!"
except Exception as e:
logging.error(f"Errore durante l'indicizzazione: {e}")
return f"Errore durante l'indicizzazione: {e}"
return "Nessun documento processato."
def list_indexed_files(db_name="default_db"):
db_path = f"faiss_index_{db_name}"
metadata_file = os.path.join(db_path, "metadata.json")
if not os.path.exists(metadata_file):
return "Nessun file nel database."
try:
with open(metadata_file, 'r') as f:
metadata = json.load(f)
if not metadata:
return "Nessun documento nel database."
output = []
for doc in metadata:
output.append(
f"πŸ“„ {doc['title']}\n"
f" Autore: {doc['author']}\n"
f" File: {doc['filename']}\n"
f" Chunks: {doc['chunks']}\n"
f" Caricato il: {doc['upload_date']}\n"
)
return "\n".join(output) if output else "Nessun documento nel database."
except Exception as e:
logging.error(f"Errore nella lettura dei metadati: {e}")
return f"Errore nella lettura dei metadati: {e}"
def delete_file_from_database(file_name, db_name="default_db"):
"""
Esempio semplificato: potresti voler rimuovere i chunk
da FAISS. Attualmente, la funzione gestisce un 'file_list.txt',
ma devi adattarla alle tue esigenze di rimozione dei chunk.
"""
db_path = f"faiss_index_{db_name}"
file_list_path = os.path.join(db_path, "file_list.txt")
if not os.path.exists(file_list_path):
return "Database non trovato (file_list.txt mancante)."
try:
        # Read the list of indexed files
with open(file_list_path, "r") as f:
files = f.readlines()
        # Remove the file from the list
files = [line.strip() for line in files if line.strip() != file_name]
        # Write the updated list back
with open(file_list_path, "w") as f:
for fl in files:
f.write(f"{fl}\n")
return f"File '{file_name}' rimosso dal database '{db_name}'."
except Exception as e:
return f"Errore durante la rimozione del file: {e}"
# -------------- DOCUMENT VISUALIZATION TAB FUNCTIONS --------------
def list_indexed_documents(db_name="default_db"):
db_path = f"faiss_index_{db_name}"
metadata_file = os.path.join(db_path, "metadata.json")
if not os.path.exists(db_path):
return f"Il database '{db_name}' non esiste."
if not os.path.exists(metadata_file):
return f"Nessun documento nel database '{db_name}'."
try:
with open(metadata_file, 'r') as f:
metadata = json.load(f)
if not metadata:
return "Nessun documento trovato nel database."
output_lines = ["πŸ“š Documenti nel database:"]
for doc in metadata:
output_lines.extend([
f"\nπŸ“„ Documento: {doc['title']}",
f" πŸ“ Autore: {doc['author']}",
f" πŸ“ File: {doc['filename']}",
f" πŸ•’ Caricato il: {doc['upload_date']}",
f" πŸ“‘ Chunks: {doc['chunks']}"
])
result = "\n".join(output_lines)
logging.info(f"Documenti trovati nel database {db_name}: {result}")
return result
except Exception as e:
error_msg = f"Errore nella lettura dei metadati: {e}"
logging.error(error_msg)
return error_msg
# -------------- NEW FEATURES TAB FUNCTIONS --------------
def search_documents(query, db_name="default_db"):
db_path = f"faiss_index_{db_name}"
if not os.path.exists(db_path):
logging.warning(f"L'indice FAISS per il database '{db_name}' non esiste.")
return "Database non trovato."
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
# Perform a similarity search
docs = vectorstore.similarity_search(query)
if not docs:
return "Nessun documento corrispondente alla query."
# Collect the document contents
results = [doc.page_content for doc in docs]
return "\n\n".join(results)
def generate_summary(db_name="default_db"):
    # Placeholder for the summarization logic (see the sketch below)
return "This is a summary of the documents in the database."