Spaces:

Nugh75
/

Edurag_beta

Sleeping

File size: 8,518 Bytes

import logging
from langchain_community.vectorstores import FAISS
import os
import json
from datetime import datetime
from app.utils.database_handling import BASE_DB_PATH
from langchain_community.embeddings import HuggingFaceEmbeddings
from app.utils.embedding_utils import *
from app.utils.dataclass_utils import DocumentMetadata, save_metadata
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx


# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def merge_metadata(existing_metadata, new_metadata, db_name):
    """Unisce i metadati esistenti con i nuovi."""
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
    
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            existing_metadata = json.load(f)
    else:
        existing_metadata = []
    
    # Converte i nuovi metadati in dizionari
    new_metadata_dicts = [meta.to_dict() if hasattr(meta, 'to_dict') else meta for meta in new_metadata]
    existing_metadata.extend(new_metadata_dicts)
    
    return existing_metadata

def upload_and_index(files, title, author, db_name="default_db"):
    if not files:
        return False, "Nessun file caricato.", ""
        
    documents = []
    doc_metadata = []
    
    # Crea directory del database se non esiste
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    os.makedirs(db_path, exist_ok=True)
    
    embeddings = get_embeddings()
    existing_vectorstore = None
    current_chunk_offset = 0
    
    try:
        # Calcola l'ultimo ID chunk utilizzato
        last_chunk_id = 0
        if os.path.exists(os.path.join(db_path, "metadata.json")):
            with open(os.path.join(db_path, "metadata.json"), 'r') as f:
                existing_metadata = json.load(f)
                last_chunk_id = sum(doc['chunks'] for doc in existing_metadata)
    
        if os.path.exists(os.path.join(db_path, "index.faiss")):
            existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.error(f"Errore caricamento vectorstore esistente: {e}")
        existing_vectorstore = None
        last_chunk_id = 0
    
    # Processa i nuovi file
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()
                    
            chunks = create_chunks(text)
            
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            doc_metadata.append(doc_meta)
            
            # Aggiungi metadati a ogni chunk
            for i, chunk in enumerate(chunks):
                chunk_id = last_chunk_id + i
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_id": chunk_id,  # ID univoco del chunk
                    "doc_chunk_index": i,  # Indice del chunk nel documento
                    "total_doc_chunks": len(chunks),
                    "filename": os.path.basename(file.name)  # Aggiunto per riferimento
                }
                documents.append(chunk_metadata)
            
            last_chunk_id += len(chunks)
            
        except Exception as e:
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue

    if documents:
        try:
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
            
            if existing_vectorstore:
                existing_vectorstore.add_texts(texts, metadatas=metadatas)
                vectorstore = existing_vectorstore
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            
            vectorstore.save_local(db_path)
            
            # Aggiorna metadata.json
            final_metadata = merge_metadata([], doc_metadata, db_name)
            with open(os.path.join(db_path, "metadata.json"), 'w') as f:
                json.dump(final_metadata, f, indent=2)
            
            return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
            
        except Exception as e:
            error_msg = f"Errore durante l'indicizzazione: {e}"
            logging.error(error_msg)
            return False, error_msg, ""
    
    return False, "Nessun documento processato.", ""

def list_indexed_files(db_name="default_db"):
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
    metadata_file = os.path.join(db_path, "metadata.json")
    
    if not os.path.exists(metadata_file):
        return "Nessun file nel database."
    
    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        
        if not metadata:
            return "Nessun documento nel database."
        
        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Autore: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Caricato il: {doc['upload_date']}\n"
            )
        
        return "\n".join(output) if output else "Nessun documento nel database."
    except Exception as e:
        logging.error(f"Errore nella lettura dei metadati: {e}")
        return f"Errore nella lettura dei metadati: {e}"

def delete_file_from_database(file_name, db_name="default_db"):
    """Elimina un file e i suoi chunks dal database."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_path = os.path.join(db_path, "metadata.json")
    
    if not os.path.exists(metadata_path):
        return "Database non trovato (metadata.json mancante)."
    
    try:
        # Carica i metadati esistenti
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        
        # Trova il file da eliminare
        file_index = next((i for i, doc in enumerate(metadata) 
                          if doc['filename'] == file_name), -1)
        
        if file_index == -1:
            return f"File '{file_name}' non trovato nel database."
            
        # Carica il vectorstore esistente
        embeddings = get_embeddings()
        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
        
        # Calcola l'intervallo di chunks da rimuovere
        chunks_before = sum(doc['chunks'] for doc in metadata[:file_index])
        chunks_to_remove = metadata[file_index]['chunks']
        
        # Estrai tutti i documenti tranne quelli da rimuovere
        all_docs = list(vectorstore.docstore._dict.items())
        docs_to_keep = (
            all_docs[:chunks_before] + 
            all_docs[chunks_before + chunks_to_remove:]
        )
        
        # Rimuovi il file dai metadati
        metadata.pop(file_index)
        
        # Ricrea il vectorstore da zero
        if docs_to_keep:
            texts = [doc[1].page_content for doc in docs_to_keep]
            metadatas = [doc[1].metadata for doc in docs_to_keep]
            new_vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            new_vectorstore.save_local(db_path)
        else:
            # Se non ci sono più documenti, rimuovi il vectorstore
            os.remove(os.path.join(db_path, "index.faiss"))
            os.remove(os.path.join(db_path, "index.pkl"))
        
        # Salva i metadati aggiornati
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)
            
        return f"File '{file_name}' eliminato con successo."
        
    except Exception as e:
        logging.error(f"Errore durante l'eliminazione: {e}")
        return f"Errore durante l'eliminazione: {e}"