import logging
import os
import shutil
import json
from dataclasses import dataclass
from datetime import datetime

import gradio as gr  # previously missing import
import PyPDF2
from docx import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# -------------- UTILITY FUNCTIONS --------------

@dataclass
class DocumentMetadata:
    filename: str
    title: str
    author: str
    upload_date: str
    chunks: int

    def to_dict(self):
        return {
            "filename": self.filename,
            "title": self.title,
            "author": self.author,
            "upload_date": self.upload_date,
            "chunks": self.chunks,
        }


def save_metadata(metadata_list, db_name):
    """Append the given documents' metadata to the database's metadata.json."""
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")

    existing_metadata = []
    if os.path.exists(metadata_file):
        with open(metadata_file, "r") as f:
            existing_metadata = json.load(f)

    existing_metadata.extend([m.to_dict() for m in metadata_list])

    with open(metadata_file, "w") as f:
        json.dump(existing_metadata, f, indent=2)


def extract_text_from_pdf(file_path):
    """Extract plain text from every page of a PDF file."""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() may return None (e.g. on image-only pages)
            text += page.extract_text() or ""
    return text


def extract_text_from_docx(file_path):
    """Extract plain text from all paragraphs of a .docx file."""
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text


# -------------- CHATBOT TAB FUNCTIONS --------------

def answer_question(question, db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
        return "Database non trovato."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    # Perform a similarity search
    docs = vectorstore.similarity_search(question)
    if not docs:
        return "Nessun documento corrispondente alla query."

    # Collect the document contents
    results = [doc.page_content for doc in docs]
    return "\n\n".join(results)
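
# --- Illustrative sketch, not part of the original app ---
# answer_question() returns the nearest chunks without saying how close
# they actually are. LangChain's FAISS wrapper also exposes
# similarity_search_with_score(), which yields (Document, score) pairs
# where the score is an L2 distance (lower means more similar). The
# function name and the 0.8 threshold below are assumptions for the
# example, not values from the original code.
def answer_question_with_scores(question, db_name="default_db", max_distance=0.8):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        return "Database non trovato."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    # Keep only the results whose distance stays under the threshold
    results = vectorstore.similarity_search_with_score(question, k=4)
    kept = [doc.page_content for doc, score in results if score <= max_distance]
    return "\n\n".join(kept) if kept else "Nessun documento corrispondente alla query."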

# -------------- DATABASE MANAGEMENT TAB FUNCTIONS --------------

def create_database(db_name):
    logging.info(f"Creating database: {db_name}")
    db_path = f"faiss_index_{db_name}"

    # Always return a (message, databases) pair so callers can unpack it
    if os.path.exists(db_path):
        return (f"Il database {db_name} esiste già.", list_databases())
    try:
        os.makedirs(db_path)
        logging.info(f"Database {db_name} created successfully.")
        databases = list_databases()
        return (f"Database {db_name} creato con successo.", databases)
    except Exception as e:
        logging.error(f"Errore nella creazione del database: {e}")
        return (f"Errore nella creazione del database: {e}", [])


def delete_database(db_name):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        return f"Il database {db_name} non esiste."
    try:
        shutil.rmtree(db_path)
        logging.info(f"Database {db_name} eliminato con successo.")
        return f"Database {db_name} eliminato con successo."
    except OSError as e:
        logging.error(f"Impossibile eliminare il database {db_name}: {e}")
        return f"Impossibile eliminare il database {db_name}: {e}"


def modify_database(old_db_name, new_db_name):
    old_db_path = f"faiss_index_{old_db_name}"
    new_db_path = f"faiss_index_{new_db_name}"

    if not os.path.exists(old_db_path):
        return f"Il database {old_db_name} non esiste."
    if os.path.exists(new_db_path):
        return f"Il database {new_db_name} esiste già."
    try:
        os.rename(old_db_path, new_db_path)
        return f"Database {old_db_name} rinominato in {new_db_name} con successo."
    except Exception as e:
        return f"Errore durante la modifica del database: {e}"


def list_databases():
    """Return the names of all FAISS databases in the working directory."""
    try:
        databases = []
        for item in os.listdir():
            if os.path.isdir(item) and item.startswith("faiss_index_"):
                databases.append(item.replace("faiss_index_", ""))
        # Ensure "default_db" is in the list
        if "default_db" not in databases:
            databases.append("default_db")
        return databases
    except Exception as e:
        logging.error(f"Error listing databases: {e}")
        return []


# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def upload_and_index(files, title, author, db_name="default_db"):
    if not files:
        return "Nessun file caricato."

    documents = []
    doc_metadata = []

    for file in files:
        try:
            if file.name.endswith(".pdf"):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith(".docx"):
                text = extract_text_from_docx(file.name)
            else:
                # Fall back to reading the file as plain text
                with open(file.name, "r", encoding="utf-8") as f:
                    text = f.read()

            chunks = text_splitter.split_text(text)

            # Document-level metadata
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks),
            )

            # Per-chunk metadata
            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date,
                }
                documents.append(chunk_metadata)

            doc_metadata.append(doc_meta)
        except Exception as e:
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue

    if documents:
        try:
            db_path = f"faiss_index_{db_name}"
            os.makedirs(db_path, exist_ok=True)

            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            # Merge into an existing index if there is one; creating a fresh
            # index unconditionally would overwrite previously indexed chunks
            # while metadata.json keeps accumulating entries.
            index_file = os.path.join(db_path, "index.faiss")
            if os.path.exists(index_file):
                vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
                vectorstore.add_texts(texts, metadatas=metadatas)
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            vectorstore.save_local(db_path)

            # Save the document-level metadata
            save_metadata(doc_metadata, db_name)

            return f"Documenti indicizzati con successo nel database {db_name}!"
        except Exception as e:
            logging.error(f"Errore durante l'indicizzazione: {e}")
            return f"Errore durante l'indicizzazione: {e}"

    return "Nessun documento processato."


def list_indexed_files(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "Nessun file nel database."
    try:
        with open(metadata_file, "r") as f:
            metadata = json.load(f)

        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Autore: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Caricato il: {doc['upload_date']}\n"
            )
        return "\n".join(output) if output else "Nessun documento nel database."
    except Exception as e:
        logging.error(f"Errore nella lettura dei metadati: {e}")
        return f"Errore nella lettura dei metadati: {e}"
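
# --- Illustrative sketch, not part of the original app ---
# upload_and_index() stores "source", "title", "chunk_index", etc. as
# per-chunk metadata, so LangChain's FAISS wrapper can post-filter search
# results on it via the `filter` argument of similarity_search(). This
# restricts a query to a single uploaded file; the function name and k
# value are assumptions for the example.
def search_in_file(query, file_name, db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        return "Database non trovato."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    # filter= keeps only chunks whose metadata matches the given dict
    docs = vectorstore.similarity_search(query, k=4, filter={"source": file_name})
    if not docs:
        return "Nessun documento corrispondente alla query."
    return "\n\n".join(doc.page_content for doc in docs)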

def delete_file_from_database(file_name, db_name="default_db"):
    """Remove a file's entry from the database's metadata.json.

    The file's chunks remain inside the FAISS index; fully removing them
    would require rebuilding the index from the remaining documents.
    """
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "Database non trovato."
    try:
        # Read the document metadata
        with open(metadata_file, "r") as f:
            metadata = json.load(f)

        # Drop the entry for the given file
        remaining = [doc for doc in metadata if doc["filename"] != file_name]
        if len(remaining) == len(metadata):
            return f"File {file_name} non trovato nel database {db_name}."

        # Write the updated list back
        with open(metadata_file, "w") as f:
            json.dump(remaining, f, indent=2)

        return f"File {file_name} rimosso dal database {db_name}."
    except Exception as e:
        return f"Errore durante la rimozione del file: {e}"


# -------------- DOCUMENT VISUALIZATION TAB FUNCTIONS --------------

def list_indexed_documents(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(db_path):
        return f"Il database {db_name} non esiste."
    if not os.path.exists(metadata_file):
        return f"Nessun documento nel database {db_name}."
    try:
        with open(metadata_file, "r") as f:
            metadata = json.load(f)

        if not metadata:
            return "Nessun documento trovato nel database."

        output_lines = ["📚 Documenti nel database:"]
        for doc in metadata:
            output_lines.extend([
                f"\n📄 Documento: {doc['title']}",
                f"   📝 Autore: {doc['author']}",
                f"   📁 File: {doc['filename']}",
                f"   🕒 Caricato il: {doc['upload_date']}",
                f"   📑 Chunks: {doc['chunks']}",
            ])

        result = "\n".join(output_lines)
        logging.info(f"Documenti trovati nel database {db_name}: {result}")
        return result
    except Exception as e:
        error_msg = f"Errore nella lettura dei metadati: {e}"
        logging.error(error_msg)
        return error_msg


# -------------- NEW FEATURES TAB FUNCTIONS --------------

def search_documents(query, db_name="default_db"):
    # Same retrieval logic as the chatbot tab; delegate instead of
    # duplicating it.
    return answer_question(query, db_name)


def generate_summary(db_name="default_db"):
    # Placeholder for summarization logic (one possible approach is
    # sketched below)
    return "This is a summary of the documents in the database."
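
# --- Illustrative sketch, not part of the original app ---
# One crude, LLM-free way to flesh out generate_summary(): walk
# metadata.json and pull each document's most title-relevant chunk as a
# stand-in summary. A real implementation would feed the chunks to a
# language model. The function name and the 200-character excerpt length
# are assumptions for the example.
def generate_summary_sketch(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(metadata_file):
        return "Nessun file nel database."

    with open(metadata_file, "r") as f:
        metadata = json.load(f)

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    parts = []
    for doc in metadata:
        # Restrict retrieval to this document's own chunks
        hits = vectorstore.similarity_search(doc["title"], k=1, filter={"source": doc["filename"]})
        excerpt = hits[0].page_content[:200] if hits else "(nessun estratto)"
        parts.append(f"📄 {doc['title']}: {excerpt}")
    return "\n\n".join(parts)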