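"""Document-management tab helpers.

Index uploaded PDF, DOCX and plain-text files into a per-database FAISS
vector store under BASE_DB_PATH, keep a metadata.json file alongside each
index, and expose helpers to list and remove indexed files.
"""
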
import json
import logging
import os
from datetime import datetime

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from app.utils.database_handling import BASE_DB_PATH
from app.utils.dataclass_utils import DocumentMetadata, save_metadata
from app.utils.embedding_utils import create_chunks, get_embeddings
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx


# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge the existing metadata with the new entries.

    If the database already has a metadata.json on disk, its contents are
    used; the ``existing_metadata`` argument is only a fallback when no
    file is present.
    """
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")

    # Prefer the metadata already persisted on disk; fall back to the argument.
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            existing_metadata = json.load(f)
    elif existing_metadata is None:
        existing_metadata = []

    # Convert the new metadata objects into plain dictionaries.
    new_metadata_dicts = [meta.to_dict() if hasattr(meta, 'to_dict') else meta for meta in new_metadata]
    existing_metadata.extend(new_metadata_dicts)

    return existing_metadata
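
# Shape of a metadata.json entry produced by merge_metadata / upload_and_index,
# following the DocumentMetadata fields used below (values are illustrative only):
#
#   {
#     "filename": "report.pdf",
#     "title": "Quarterly report",
#     "author": "Jane Doe",
#     "upload_date": "2024-01-01 12:00:00",
#     "chunks": 12
#   }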

def upload_and_index(files, title, author, db_name="default_db"):
    """Extract text from the uploaded files, chunk it, and index it in the FAISS database."""
    if not files:
        return False, "No file uploaded.", ""

    documents = []
    doc_metadata = []

    # Create the database directory if it does not exist
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    os.makedirs(db_path, exist_ok=True)

    embeddings = get_embeddings()
    existing_vectorstore = None

    try:
        if os.path.exists(os.path.join(db_path, "index.faiss")):
            existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        logging.error(f"Error loading existing vectorstore: {e}")
        existing_vectorstore = None
    # Process the new files
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = create_chunks(text)

            # Offset the new chunk indices past the chunks already stored in the
            # vectorstore and past those queued from earlier files in this batch
            chunk_offset = len(documents)
            if existing_vectorstore:
                chunk_offset += len(existing_vectorstore.docstore._dict)

            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            doc_metadata.append(doc_meta)

            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": chunk_offset + i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)

        except Exception as e:
            logging.error(f"Error reading file {file.name}: {e}")
            continue

    if documents:
        try:
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            # Append to the existing index if there is one, otherwise build a new index
            if existing_vectorstore:
                existing_vectorstore.add_texts(texts, metadatas=metadatas)
                vectorstore = existing_vectorstore
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

            vectorstore.save_local(db_path)

            final_metadata = merge_metadata([], doc_metadata, db_name)

            # Save the document metadata alongside the index
            metadata_path = os.path.join(db_path, "metadata.json")
            with open(metadata_path, 'w') as f:
                json.dump(final_metadata, f, indent=2)

            return True, "Documents indexed successfully!", f"Database '{db_name}' updated"

        except Exception as e:
            error_msg = f"Indexing error: {e}"
            logging.error(error_msg)
            return False, error_msg, ""

    return False, "No document processed.", ""

def list_indexed_files(db_name="default_db"):
    """Return a human-readable summary of the documents indexed in the given database."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "No files in the database."

    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        if not metadata:
            return "No documents in the database."

        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Author: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Uploaded on: {doc['upload_date']}\n"
            )

        return "\n".join(output) if output else "No documents in the database."
    except Exception as e:
        logging.error(f"Error reading metadata: {e}")
        return f"Error reading metadata: {e}"

def delete_file_from_database(file_name, db_name="default_db"):
    """
    Simplified example: you will likely also want to remove the file's chunks
    from the FAISS index. At the moment the function only maintains a
    'file_list.txt'; adapt it to your own chunk-removal needs.
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    file_list_path = os.path.join(db_path, "file_list.txt")

    if not os.path.exists(file_list_path):
        return "Database not found (file_list.txt missing)."

    try:
        # Read the list of files
        with open(file_list_path, "r") as f:
            files = f.readlines()

        # Remove the file from the list
        files = [line.strip() for line in files if line.strip() != file_name]

        # Rewrite the updated list
        with open(file_list_path, "w") as f:
            for fl in files:
                f.write(f"{fl}\n")

        return f"File '{file_name}' removed from database '{db_name}'."
    except Exception as e:
        return f"Error removing file: {e}"