import logging
import gradio as gr
from langchain_community.vectorstores import FAISS
import os
import PyPDF2
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from datetime import datetime
from app.functions.database_handling import BASE_DB_PATH
from langchain_community.embeddings import HuggingFaceEmbeddings
from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
from app.utils.embedding_utils import get_embeddings
from app.utils.dataclass_utils import DocumentMetadata, save_metadata


# -------------- UTILITY FUNCTIONS --------------

def extract_text_from_pdf(file_path):
    """
    Extracts the text from a PDF file.

    Args:
        file_path: Path to the PDF file

    Returns:
        str: Text extracted from the PDF
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
        return text

def extract_text_from_docx(file_path):
    """
    Extracts the text from a DOCX file.

    Args:
        file_path: Path to the DOCX file

    Returns:
        str: Text extracted from the Word document
    """
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

def create_chunks(text):
    """Splits raw text into overlapping chunks using the sizes from EMBEDDING_CONFIG."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=EMBEDDING_CONFIG["chunk_size"],
        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    return text_splitter.split_text(text)


def create_vectorstore(texts, metadatas, db_path):
    """Builds a FAISS index from texts and metadatas, saves it to db_path and returns it."""
    embeddings = get_embeddings()
    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    db.save_local(db_path)
    return db



# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def upload_and_index(files, title, author, db_name="default_db"):
    """
    Reads the uploaded files, splits them into chunks and indexes them into
    the FAISS database identified by db_name.
    """
    if not files:
        return "Nessun file caricato."
        
    documents = []
    doc_metadata = []
    
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                # .txt or other plain-text file
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()
                    
            chunks = create_chunks(text)
            
            # Per-document metadata
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            
            # Per-chunk metadata, stored alongside the chunk text
            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)
            
            doc_metadata.append(doc_meta)
            
        except Exception as e:
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue

    if documents:
        try:
            db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
            os.makedirs(db_path, exist_ok=True)
            
            # Use the centralized embeddings helper instead of constructing them directly
            embeddings = get_embeddings()
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
            
            # Create (or overwrite) the FAISS index from these documents
            vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            vectorstore.save_local(db_path)
            
            # Persist the per-document metadata to disk
            save_metadata(doc_metadata, db_name)
            
            return f"Documenti indicizzati con successo nel database '{db_name}'!"
        except Exception as e:
            logging.error(f"Errore durante l'indicizzazione: {e}")
            return f"Errore durante l'indicizzazione: {e}"
    
    return "Nessun documento processato."

def list_indexed_files(db_name="default_db"):
    """Returns a human-readable summary of the documents indexed in db_name."""
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")
    
    if not os.path.exists(metadata_file):
        return "Nessun file nel database."
    
    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        
        if not metadata:
            return "Nessun documento nel database."
        
        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Autore: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Caricato il: {doc['upload_date']}\n"
            )
        
        return "\n".join(output) if output else "Nessun documento nel database."
    except Exception as e:
        logging.error(f"Errore nella lettura dei metadati: {e}")
        return f"Errore nella lettura dei metadati: {e}"

def delete_file_from_database(file_name, db_name="default_db"):
    """
    Simplified example: you will likely want to remove the file's chunks from
    the FAISS index as well. At the moment the function only maintains a
    'file_list.txt'; adapt it to your chunk-removal needs (see the sketch
    after this function).
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    file_list_path = os.path.join(db_path, "file_list.txt")
    
    if not os.path.exists(file_list_path):
        return "Database non trovato (file_list.txt mancante)."
    
    try:
        # Read the list of indexed files
        with open(file_list_path, "r") as f:
            files = f.readlines()
        
        # Drop the requested file from the list
        files = [line.strip() for line in files if line.strip() != file_name]
        
        # Rewrite the updated list
        with open(file_list_path, "w") as f:
            for fl in files:
                f.write(f"{fl}\n")
        
        return f"File '{file_name}' rimosso dal database '{db_name}'."
    except Exception as e:
        return f"Errore durante la rimozione del file: {e}"