# Spaces: Nugh75 / Edurag_beta (status: Sleeping)
# File size: 11,423 Bytes
# Revision 47e4aa2

import logging
import gradio as gr  # Aggiunto import mancante
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil
import PyPDF2
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dataclasses import dataclass
import json
from datetime import datetime

# Shared splitter used when indexing uploads: chunk size 500 with an overlap
# of 100 between consecutive chunks (units are whatever the splitter's length
# function counts).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

# -------------- UTILITY FUNCTIONS --------------
@dataclass
class DocumentMetadata:
    """Descriptive record kept for one uploaded document."""

    # Base name of the uploaded file.
    filename: str
    # Title supplied with the upload.
    title: str
    # Author supplied with the upload.
    author: str
    # Upload timestamp, already formatted as a string by the caller.
    upload_date: str
    # Number of text chunks the document was split into.
    chunks: int

    def to_dict(self):
        """Return the record as a plain dict keyed by field name."""
        ordered_fields = ("filename", "title", "author", "upload_date", "chunks")
        return {name: getattr(self, name) for name in ordered_fields}

def save_metadata(metadata_list, db_name):
    """Append document records to the database's ``metadata.json``.

    Records already present in the file are kept; the new ones are added
    after them and the whole list is written back.

    Args:
        metadata_list: Iterable of objects exposing ``to_dict()`` (for
            example ``DocumentMetadata`` instances).
        db_name: Database name; the file lives in ``faiss_index_<db_name>``.

    Raises:
        json.JSONDecodeError: If an existing ``metadata.json`` is not valid JSON.
        OSError: If the folder or file cannot be created or written.
    """
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")

    # The database folder may not exist yet; without this the final open()
    # fails with FileNotFoundError.
    os.makedirs(db_path, exist_ok=True)

    existing_metadata = []
    if os.path.exists(metadata_file):
        # json.dump() below emits ASCII-only output (ensure_ascii default), so
        # an explicit UTF-8 codec reads older files identically while removing
        # the dependency on the platform's default encoding.
        with open(metadata_file, 'r', encoding='utf-8') as f:
            existing_metadata = json.load(f)

    existing_metadata.extend(m.to_dict() for m in metadata_list)

    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(existing_metadata, f, indent=2)

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Page texts are concatenated in page order with no separator.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        return "".join(page_texts)

def extract_text_from_docx(file_path):
    """Return the paragraph texts of the .docx at ``file_path``.

    Every paragraph is followed by a newline, including the last one.
    """
    docx_file = Document(file_path)
    return "".join(paragraph.text + "\n" for paragraph in docx_file.paragraphs)

# -------------- CHATBOT TAB FUNCTIONS --------------
def answer_question(question, db_name="default_db"):
    """Return the stored chunks most similar to ``question``.

    Args:
        question: Query text passed to the vector store's similarity search.
        db_name: Database name; the index is read from ``faiss_index_<db_name>``.

    Returns:
        str: The matching chunk contents separated by blank lines, or a
        user-facing message when the database folder is missing, holds no
        index yet, or nothing matches.
    """
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
        return "Database non trovato."

    # create_database() only makes an empty folder; the index file appears
    # after the first save_local(). Loading before that raises, so report the
    # empty database instead of crashing.
    if not os.path.exists(os.path.join(db_path, "index.faiss")):
        logging.warning(f"Il database {db_name} non contiene un indice FAISS.")
        return f"Il database {db_name} non contiene documenti indicizzati."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # NOTE(security): allow_dangerous_deserialization=True lets load_local
    # unpickle the stored docstore. Only acceptable while the index folders
    # are written exclusively by this application.
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    docs = vectorstore.similarity_search(question)
    if not docs:
        return "Nessun documento corrispondente alla query."

    return "\n\n".join(doc.page_content for doc in docs)

# -------------- DATABASE MANAGEMENT TAB FUNCTIONS --------------
def create_database(db_name):
    """Create the folder that backs a new database.

    Args:
        db_name: Database name; the folder is ``faiss_index_<db_name>``.

    Returns:
        tuple: ``(message, databases)`` on every path. ``databases`` is the
        current list from ``list_databases()``, or ``[]`` when the folder
        could not be created.
    """
    logging.info(f"Creating database: {db_name}")
    db_path = f"faiss_index_{db_name}"

    if os.path.exists(db_path):
        # Same (message, databases) shape as the other branches, so a caller
        # that unpacks two values does not break on this path.
        return (f"Il database {db_name} esiste già.", list_databases())

    try:
        os.makedirs(db_path)
    except Exception as e:
        logging.error(f"Errore nella creazione del database: {e}")
        return (f"Errore nella creazione del database: {e}", [])

    logging.info(f"Database {db_name} created successfully.")
    return (f"Database {db_name} creato con successo.", list_databases())

def delete_database(db_name):
    """Remove the folder ``faiss_index_<db_name>`` and everything in it.

    Returns a user-facing status message: missing database, success, or the
    OS error that prevented the removal.
    """
    target_dir = f"faiss_index_{db_name}"
    if os.path.exists(target_dir):
        try:
            shutil.rmtree(target_dir)
            outcome = f"Database {db_name} eliminato con successo."
            logging.info(outcome)
            return outcome
        except OSError as err:
            failure = f"Impossibile eliminare il database {db_name}: {err}"
            logging.error(failure)
            return failure
    return f"Il database {db_name} non esiste."

def modify_database(old_db_name, new_db_name):
    """Rename a database by renaming its ``faiss_index_<name>`` folder.

    Returns a user-facing status message: source missing, target already
    present, success, or the error raised by the rename.
    """
    source_dir, target_dir = (
        f"faiss_index_{name}" for name in (old_db_name, new_db_name)
    )

    if not os.path.exists(source_dir):
        return f"Il database {old_db_name} non esiste."
    if os.path.exists(target_dir):
        return f"Il database {new_db_name} esiste già."

    try:
        os.rename(source_dir, target_dir)
    except Exception as err:
        return f"Errore durante la modifica del database: {err}"
    return f"Database {old_db_name} rinominato in {new_db_name} con successo."

def list_databases():
    """Return the names of the databases found in the working directory.

    A database is any directory named ``faiss_index_<name>``. ``"default_db"``
    is always included, appended last when no such folder exists.

    Returns:
        list[str]: Database names in ``os.listdir()`` order, or ``[]`` if the
        directory cannot be listed.
    """
    prefix = "faiss_index_"
    try:
        # Slice off only the leading prefix: str.replace() would also strip
        # the text from inside the name, e.g. "faiss_index_faiss_index_b".
        databases = [
            entry[len(prefix):]
            for entry in os.listdir()
            if entry.startswith(prefix) and os.path.isdir(entry)
        ]
    except Exception as e:
        logging.error(f"Error listing databases: {e}")
        return []

    # Ensure "default_db" is always offered, even before it has been created.
    if "default_db" not in databases:
        databases.append("default_db")
    return databases

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
def upload_and_index(files, title, author, db_name="default_db"):
    """Extract, chunk, embed and store uploaded files in a FAISS database.

    Each file is read according to its extension (``.pdf``, ``.docx``,
    anything else as UTF-8 text) and split with the module-level
    ``text_splitter``. Files that cannot be read are logged and skipped.
    The resulting chunks are added to the database's index and one record
    per file is appended to ``metadata.json``.

    Args:
        files: Uploaded file objects; only their ``.name`` attribute is used,
            as the path of the file on disk.
        title: Title recorded for every file in this upload.
        author: Author recorded for every file in this upload.
        db_name: Database name; data is stored in ``faiss_index_<db_name>``.

    Returns:
        str: A user-facing status message.
    """
    if not files:
        return "Nessun file caricato."

    documents = []
    doc_metadata = []

    for file in files:
        try:
            # Match extensions case-insensitively so "REPORT.PDF" is not read
            # as plain text (which would fail to decode and be skipped).
            lowered_name = file.name.lower()
            if lowered_name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif lowered_name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = text_splitter.split_text(text)
            source_name = os.path.basename(file.name)

            # Per-document record, later appended to metadata.json.
            doc_meta = DocumentMetadata(
                filename=source_name,
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )

            # Per-chunk record: the text plus the metadata stored beside it.
            for i, chunk in enumerate(chunks):
                documents.append({
                    "content": chunk,
                    "source": source_name,
                    "title": title,
                    "author": author,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                })

            doc_metadata.append(doc_meta)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole upload.
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue

    if not documents:
        return "Nessun documento processato."

    try:
        db_path = f"faiss_index_{db_name}"
        os.makedirs(db_path, exist_ok=True)

        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        texts = [doc["content"] for doc in documents]
        metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

        if os.path.exists(os.path.join(db_path, "index.faiss")):
            # Extend the existing index. Building a fresh store and saving it
            # over the old one would drop every previously uploaded document
            # while metadata.json kept listing them.
            # NOTE(security): load_local unpickles the stored docstore; only
            # safe while these folders are written solely by this application.
            vectorstore = FAISS.load_local(
                db_path, embeddings, allow_dangerous_deserialization=True
            )
            vectorstore.add_texts(texts, metadatas=metadatas)
        else:
            vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        vectorstore.save_local(db_path)

        # Persist the per-document records next to the index.
        save_metadata(doc_metadata, db_name)

        return f"Documenti indicizzati con successo nel database {db_name}!"
    except Exception as e:
        logging.error(f"Errore durante l'indicizzazione: {e}")
        return f"Errore durante l'indicizzazione: {e}"

def list_indexed_files(db_name="default_db"):
    """Describe the documents recorded in the database's ``metadata.json``.

    Returns one text entry per record (title, author, file, chunk count and
    upload date), a fixed message when there is nothing to list, or an error
    message if the metadata cannot be read or formatted.
    """
    metadata_file = os.path.join(f"faiss_index_{db_name}", "metadata.json")
    if not os.path.exists(metadata_file):
        return "Nessun file nel database."

    entry_template = (
        "📄 {title}\n"
        "   Autore: {author}\n"
        "   File: {filename}\n"
        "   Chunks: {chunks}\n"
        "   Caricato il: {upload_date}\n"
    )

    try:
        with open(metadata_file, 'r') as handle:
            records = json.load(handle)

        entries = [
            entry_template.format(
                title=record['title'],
                author=record['author'],
                filename=record['filename'],
                chunks=record['chunks'],
                upload_date=record['upload_date'],
            )
            for record in records
        ]
        if not entries:
            return "Nessun documento nel database."
        return "\n".join(entries)
    except Exception as err:
        logging.error(f"Errore nella lettura dei metadati: {err}")
        return f"Errore nella lettura dei metadati: {err}"

def delete_file_from_database(file_name, db_name="default_db"):
    """Remove a file's records from a database's bookkeeping files.

    Entries whose ``filename`` equals ``file_name`` are dropped from
    ``metadata.json`` (the file written by ``save_metadata``). A legacy
    ``file_list.txt`` is still updated when present.

    NOTE: chunks already embedded in the FAISS index are not removed here;
    only the bookkeeping files are updated.

    Args:
        file_name: Base name of the file to remove.
        db_name: Database name; files live in ``faiss_index_<db_name>``.

    Returns:
        str: A user-facing status message.
    """
    db_path = f"faiss_index_{db_name}"
    file_list_path = os.path.join(db_path, "file_list.txt")
    metadata_file = os.path.join(db_path, "metadata.json")

    has_file_list = os.path.exists(file_list_path)
    has_metadata = os.path.exists(metadata_file)
    # Previously only file_list.txt was checked, so databases tracked solely
    # through metadata.json were always reported as missing.
    if not (has_file_list or has_metadata):
        return "Database non trovato."

    try:
        if has_file_list:
            with open(file_list_path, "r") as f:
                remaining = [line.strip() for line in f if line.strip() != file_name]
            with open(file_list_path, "w") as f:
                f.writelines(f"{name}\n" for name in remaining)

        if has_metadata:
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            kept = [doc for doc in metadata if doc.get("filename") != file_name]
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(kept, f, indent=2)

        return f"File {file_name} rimosso dal database {db_name}."
    except Exception as e:
        return f"Errore durante la rimozione del file: {e}"

# -------------- DOCUMENT VISUALIZATION TAB FUNCTIONS --------------
def list_indexed_documents(db_name="default_db"):
    """Build a readable report of the documents recorded for a database.

    Returns a header followed by one section per record in ``metadata.json``,
    a fixed message when the database or its metadata is missing or empty,
    or an error message if the metadata cannot be read or formatted. The
    report is also written to the log at INFO level.
    """
    db_dir = f"faiss_index_{db_name}"
    if not os.path.exists(db_dir):
        return f"Il database {db_name} non esiste."

    metadata_file = os.path.join(db_dir, "metadata.json")
    if not os.path.exists(metadata_file):
        return f"Nessun documento nel database {db_name}."

    try:
        with open(metadata_file, 'r') as handle:
            records = json.load(handle)

        if not records:
            return "Nessun documento trovato nel database."

        sections = ["📚 Documenti nel database:"]
        for record in records:
            # The leading "\n" leaves a blank line before each document.
            sections.append("\n".join((
                f"\n📄 Documento: {record['title']}",
                f"   📝 Autore: {record['author']}",
                f"   📁 File: {record['filename']}",
                f"   🕒 Caricato il: {record['upload_date']}",
                f"   📑 Chunks: {record['chunks']}",
            )))

        result = "\n".join(sections)
        logging.info(f"Documenti trovati nel database {db_name}: {result}")
        return result

    except Exception as err:
        error_msg = f"Errore nella lettura dei metadati: {err}"
        logging.error(error_msg)
        return error_msg

# -------------- NEW FEATURES TAB FUNCTIONS --------------
def search_documents(query, db_name="default_db"):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Search text passed to the vector store's similarity search.
        db_name: Database name; the index is read from ``faiss_index_<db_name>``.

    Returns:
        str: The matching chunk contents separated by blank lines, or a
        user-facing message when the database folder is missing, holds no
        index yet, or nothing matches.
    """
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
        return "Database non trovato."

    # A folder made by create_database() is empty until the first upload has
    # saved an index; loading before that raises, so report it instead.
    if not os.path.exists(os.path.join(db_path, "index.faiss")):
        logging.warning(f"Il database {db_name} non contiene un indice FAISS.")
        return f"Il database {db_name} non contiene documenti indicizzati."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # NOTE(security): allow_dangerous_deserialization=True lets load_local
    # unpickle the stored docstore. Only acceptable while the index folders
    # are written exclusively by this application.
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    docs = vectorstore.similarity_search(query)
    if not docs:
        return "Nessun documento corrispondente alla query."

    return "\n\n".join(doc.page_content for doc in docs)

def generate_summary(db_name="default_db"):
    """Return a fixed placeholder summary; ``db_name`` is accepted but unused."""
    # Summarization is not implemented yet: every call yields the same text.
    placeholder_summary = "This is a summary of the documents in the database."
    return placeholder_summary