Spaces:

Nugh75
/

Edurag_beta

Sleeping

App Files Files Community

Nugh75 committed on Jan 4

Commit

9804548

1 Parent(s): 03e1062

issue chunk risolto

Browse files

Files changed (7) hide show

app.py +5 -3
app/document_handling.py +57 -20
app/llm_handling.py +47 -29
db/.DS_Store +0 -0
ui/chunks_viewer_tab.py +104 -0
ui/db_management_tab.py +2 -1
ui/document_management_tab.py +8 -6

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from ui.chatbot_tab import create_chatbot_tab
 from ui.db_management_tab import create_db_management_tab
 from ui.document_management_tab import create_document_management_tab
 from ui.info_tab import create_info_tab  # Importa la nuova tab
 # Configura il logging
 configure_logging()
@@ -24,21 +25,22 @@ def main():
             info_refs = create_info_tab()
             chat_refs = create_chatbot_tab()
             doc_refs = create_document_management_tab()
             db_refs = create_db_management_tab # Crea la nuova tab delle informazioni
             # Crea dizionario completo dei riferimenti
             dropdowns = {
                 "document": doc_refs,
                 "chat": chat_refs,
-                "info": info_refs
             }
             # Crea i tab nell'ordine corretto
             chat_refs                                    # Tab 4: Chatbot (ultima tab)
             doc_refs  # Tab 2: Document Management
             db_refs(dropdowns)
             info_refs                                    # Tab 5: Info (ultima tab)
             rag_chatbot.launch()

 from ui.db_management_tab import create_db_management_tab
 from ui.document_management_tab import create_document_management_tab
 from ui.info_tab import create_info_tab  # Importa la nuova tab
+from ui.chunks_viewer_tab import create_chunks_viewer_tab  # Aggiungi l'import in cima al file
 # Configura il logging
 configure_logging()
             info_refs = create_info_tab()
             chat_refs = create_chatbot_tab()
             doc_refs = create_document_management_tab()
+            chunks_refs = create_chunks_viewer_tab()  # Aggiungi il nuovo tab
             db_refs = create_db_management_tab # Crea la nuova tab delle informazioni
             # Crea dizionario completo dei riferimenti
             dropdowns = {
                 "document": doc_refs,
                 "chat": chat_refs,
+                "info": info_refs,
+                "chunks": chunks_refs  # Aggiungi il riferimento
             }
             # Crea i tab nell'ordine corretto
             chat_refs                                    # Tab 4: Chatbot (ultima tab)
             doc_refs  # Tab 2: Document Management
             db_refs(dropdowns)
+            chunks_refs  # Aggiungi il tab dei chunks
             info_refs                                    # Tab 5: Info (ultima tab)
             rag_chatbot.launch()

app/document_handling.py CHANGED Viewed

@@ -12,13 +12,44 @@ from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_doc
 # -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
 def upload_and_index(files, title, author, db_name="default_db"):
     if not files:
-        return "Nessun file caricato."
     documents = []
     doc_metadata = []
     for file in files:
         try:
             if file.name.endswith('.pdf'):
@@ -26,13 +57,16 @@ def upload_and_index(files, title, author, db_name="default_db"):
             elif file.name.endswith('.docx'):
                 text = extract_text_from_docx(file.name)
             else:
-                # File .txt o altro testo semplice
                 with open(file.name, 'r', encoding='utf-8') as f:
                     text = f.read()
             chunks = create_chunks(text)
-            # Metadata per il documento
             doc_meta = DocumentMetadata(
                 filename=os.path.basename(file.name),
                 title=title,
@@ -40,49 +74,52 @@ def upload_and_index(files, title, author, db_name="default_db"):
                 upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 chunks=len(chunks)
             )
-            # Metadata per ogni chunk
             for i, chunk in enumerate(chunks):
                 chunk_metadata = {
                     "content": chunk,
                     "source": os.path.basename(file.name),
                     "title": title,
                     "author": author,
-                    "chunk_index": i,
                     "total_chunks": len(chunks),
                     "upload_date": doc_meta.upload_date
                 }
                 documents.append(chunk_metadata)
-            doc_metadata.append(doc_meta)
         except Exception as e:
             logging.error(f"Errore durante la lettura del file {file.name}: {e}")
             continue
     if documents:
         try:
-            db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
-            os.makedirs(db_path, exist_ok=True)
-            # Usa la funzione centralizzata invece dell'inizializzazione diretta
-            embeddings = get_embeddings()
             texts = [doc["content"] for doc in documents]
             metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
-            # Crea o sovrascrivi l'indice FAISS con questi documenti
-            vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
             vectorstore.save_local(db_path)
-            # Salva i metadati del documento su file
-            save_metadata(doc_metadata, db_name)
-            return f"Documenti indicizzati con successo nel database '{db_name}'!"
         except Exception as e:
-            logging.error(f"Errore durante l'indicizzazione: {e}")
-            return f"Errore durante l'indicizzazione: {e}"
-    return "Nessun documento processato."
 def list_indexed_files(db_name="default_db"):
     db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui

 # -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
def merge_metadata(existing_metadata, new_metadata, db_name):
    """Merge metadata of newly indexed documents with the stored metadata.

    The entries persisted in ``metadata.json`` inside the database's
    ``faiss_index_<db_name>`` directory take precedence. When that file does
    not exist, ``existing_metadata`` is used as the starting point instead of
    being silently discarded.

    Args:
        existing_metadata: Fallback list of metadata dicts, used only when no
            ``metadata.json`` exists for the database. May be ``None`` or
            empty. It is never mutated.
        new_metadata: Iterable of entries to append. Objects exposing a
            ``to_dict()`` method are converted with it; anything else is
            appended unchanged.
        db_name: Database name used to build the ``faiss_index_<db_name>``
            directory name under ``BASE_DB_PATH``.

    Returns:
        A new list containing the stored (or fallback) entries followed by
        the new ones.

    Raises:
        json.JSONDecodeError: If ``metadata.json`` exists but is not valid
            JSON. The error is propagated rather than swallowed so existing
            metadata is never replaced by an empty list by accident.
    """
    metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
    try:
        # Open directly instead of checking os.path.exists() first: one
        # filesystem access and no window between the check and the read.
        # json.dump() escapes non-ASCII characters by default, so files
        # written that way decode cleanly as UTF-8.
        with open(metadata_path, "r", encoding="utf-8") as f:
            merged = json.load(f)
    except FileNotFoundError:
        # Copy the fallback so the caller's list is left untouched.
        merged = list(existing_metadata or [])

    merged.extend(
        meta.to_dict() if hasattr(meta, "to_dict") else meta
        for meta in new_metadata
    )
    return merged
 def upload_and_index(files, title, author, db_name="default_db"):
     if not files:
+        return False, "Nessun file caricato.", ""
     documents = []
     doc_metadata = []
+    # Crea directory del database se non esiste
+    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+    os.makedirs(db_path, exist_ok=True)
+    embeddings = get_embeddings()
+    existing_vectorstore = None
+    try:
+        if os.path.exists(os.path.join(db_path, "index.faiss")):
+            existing_vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
+    except Exception as e:
+        logging.error(f"Errore caricamento vectorstore esistente: {e}")
+        existing_vectorstore = None
+    # Processa i nuovi file
     for file in files:
         try:
             if file.name.endswith('.pdf'):
             elif file.name.endswith('.docx'):
                 text = extract_text_from_docx(file.name)
             else:
                 with open(file.name, 'r', encoding='utf-8') as f:
                     text = f.read()
             chunks = create_chunks(text)
+            # Calcola l'offset per i nuovi chunks
+            chunk_offset = 0
+            if existing_vectorstore:
+                chunk_offset = len(existing_vectorstore.docstore._dict)
             doc_meta = DocumentMetadata(
                 filename=os.path.basename(file.name),
                 title=title,
                 upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 chunks=len(chunks)
             )
+            doc_metadata.append(doc_meta)
             for i, chunk in enumerate(chunks):
                 chunk_metadata = {
                     "content": chunk,
                     "source": os.path.basename(file.name),
                     "title": title,
                     "author": author,
+                    "chunk_index": chunk_offset + i,
                     "total_chunks": len(chunks),
                     "upload_date": doc_meta.upload_date
                 }
                 documents.append(chunk_metadata)
         except Exception as e:
             logging.error(f"Errore durante la lettura del file {file.name}: {e}")
             continue
     if documents:
         try:
             texts = [doc["content"] for doc in documents]
             metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
+            if existing_vectorstore:
+                existing_vectorstore.add_texts(texts, metadatas=metadatas)
+                vectorstore = existing_vectorstore
+            else:
+                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
             vectorstore.save_local(db_path)
+            final_metadata = merge_metadata([], doc_metadata, db_name)
+            # Salva i metadati
+            metadata_path = os.path.join(db_path, "metadata.json")
+            with open(metadata_path, 'w') as f:
+                json.dump(final_metadata, f, indent=2)
+            return True, "Documenti indicizzati con successo!", f"Database '{db_name}' aggiornato"
         except Exception as e:
+            error_msg = f"Errore durante l'indicizzazione: {e}"
+            logging.error(error_msg)
+            return False, error_msg, ""
+    return False, "Nessun documento processato.", ""
 def list_indexed_files(db_name="default_db"):
     db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui

app/llm_handling.py CHANGED Viewed

@@ -5,9 +5,14 @@ from langchain_community.vectorstores import FAISS
 import requests
 from tenacity import retry, stop_after_attempt, wait_exponential
 import json
-from app.config import BASE_DB_PATH  # Ensure correct import
-from app.config import LLM_CONFIGS, LLMType  # Import LLMType and LLM_CONFIGS
 from app.configs.prompts import SYSTEM_PROMPTS
 from app.utils.embedding_utils import get_embeddings
@@ -47,20 +52,21 @@ def read_metadata(db_path):
             return json.load(f)
     return []
-def get_relevant_documents(vectorstore, question, min_similarity=0.7):
     """Retrieves relevant documents from the vectorstore"""
     try:
         enhanced_query = enhance_query(question)
         docs_and_scores = vectorstore.similarity_search_with_score(
             enhanced_query,
-            k=8
         )
         filtered_docs = [
-            doc for doc, score in docs_and_scores if score >= min_similarity
         ]
         logging.info(f"Query: {question}")
         logging.info(f"Documents found: {len(filtered_docs)}")
-        return filtered_docs[:5] if filtered_docs else []
     except Exception as e:
         logging.error(f"Error retrieving documents: {e}")
         return []
@@ -68,8 +74,7 @@ def get_relevant_documents(vectorstore, question, min_similarity=0.7):
 def enhance_query(question):
     stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
     words = [w for w in question.lower().split() if w not in stop_words]
-    enhanced_query = " ".join(words)
-    return enhanced_query
 def log_search_results(question, docs_and_scores):
     logging.info(f"Query: {question}")
@@ -78,39 +83,52 @@ def log_search_results(question, docs_and_scores):
         logging.info(f"Content: {doc.page_content[:100]}...")
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
-def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=None):
     if chat_history is None:
         chat_history = []
     try:
-        embeddings = get_embeddings()
         db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-        metadata_list = read_metadata(db_path)
-        metadata_dict = {m["filename"]: m for m in metadata_list}
         vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
         relevant_docs = get_relevant_documents(vectorstore, question)
         if not relevant_docs:
             return [
                 {"role": "user", "content": question},
-                {"role": "assistant", "content": "Sorry, no relevant information found to answer your question. Try rephrasing or asking a different question."}
             ]
         sources = []
-        for idx, doc in enumerate(relevant_docs, 1):
-            source_file = doc.metadata.get("source", "Unknown")
-            if source_file in metadata_dict:
-                meta = metadata_dict[source_file]
-                sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Part {idx} of {len(relevant_docs)}")
-        context = "\n".join([
-            f"[Part {idx+1} of {len(relevant_docs)}]\n{doc.page_content}"
-            for idx, doc in enumerate(relevant_docs)
-        ])
-        sources_text = "\n\nSources consulted:\n" + "\n".join(set(sources))
         prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
-        prompt += "\nAlways cite the sources used for your response, including the document title and author."
         messages = [
             {"role": "system", "content": prompt},
             *[{"role": m["role"], "content": m["content"]} for m in chat_history],
             {"role": "user", "content": question}
         ]
         client, model = get_llm_client(llm_type)
         response = client.chat.completions.create(
             model=model,
@@ -118,14 +136,15 @@ def answer_question(question, db_name, prompt_type="tutor", chat_history=None, l
             temperature=0.7,
             max_tokens=2048
         )
         answer = response.choices[0].message.content + sources_text
         return [
             {"role": "user", "content": question},
             {"role": "assistant", "content": answer}
         ]
     except Exception as e:
-        logging.error(f"Error generating response: {e}")
-        error_msg = "Local LLM not available. Try again later or use OpenAI." if "local" in str(llm_type) else str(e)
         return [
             {"role": "user", "content": question},
             {"role": "assistant", "content": f"⚠️ {error_msg}"}
@@ -144,11 +163,10 @@ class DocumentRetriever:
         enhanced_query = enhance_query(question)
         docs_and_scores = self.vectorstore.similarity_search_with_score(
             enhanced_query,
-            k=8
         )
         log_search_results(question, docs_and_scores)
-        # Implement _filter_relevant_docs or remove the call
-        # return self._filter_relevant_docs(docs_and_scores)
 if __name__ == "__main__":
     pass

 import requests
 from tenacity import retry, stop_after_attempt, wait_exponential
 import json
+from collections import defaultdict
+from app.config import (
+    BASE_DB_PATH,
+    LLM_CONFIGS,
+    LLMType,
+    EMBEDDING_CONFIG
+)
 from app.configs.prompts import SYSTEM_PROMPTS
 from app.utils.embedding_utils import get_embeddings
             return json.load(f)
     return []
def get_relevant_documents(vectorstore, question, min_similarity=None, k=None):
    """Retrieve the documents relevant to ``question`` from the vector store.

    The question is normalised with ``enhance_query`` and the store is asked
    for relevance scores, which are then thresholded.

    Args:
        vectorstore: Vector store exposing
            ``similarity_search_with_relevance_scores(query, k=...)``.
        question: The user's question.
        min_similarity: Minimum relevance score a document must reach.
            Defaults to ``EMBEDDING_CONFIG['min_similarity']``.
        k: Number of candidates requested from the store. Defaults to
            ``EMBEDDING_CONFIG['k_documents']``.

    Returns:
        The documents whose relevance score is at least ``min_similarity``,
        in the order returned by the store. An empty list is returned when
        nothing qualifies or when any error occurs (the error is logged).
    """
    try:
        if min_similarity is None:
            min_similarity = EMBEDDING_CONFIG['min_similarity']
        if k is None:
            k = EMBEDDING_CONFIG['k_documents']

        enhanced_query = enhance_query(question)
        # Per LangChain's FAISS documentation, similarity_search_with_score()
        # returns a distance (lower = closer), so filtering it with
        # `score >= min_similarity` would keep the weakest matches.
        # Relevance scores are normalised so that higher means more similar.
        docs_and_scores = vectorstore.similarity_search_with_relevance_scores(
            enhanced_query,
            k=k
        )
        filtered_docs = [
            doc for doc, score in docs_and_scores
            if score >= min_similarity
        ]
        logging.info(f"Query: {question}")
        logging.info(f"Documents found: {len(filtered_docs)}")
        return filtered_docs
    except Exception as e:
        # Best-effort retrieval: callers treat an empty list as "nothing found".
        logging.error(f"Error retrieving documents: {e}")
        return []


def enhance_query(question):
    """Lower-case ``question`` and drop Italian articles from it.

    Args:
        question: Raw question text.

    Returns:
        The remaining words joined by single spaces. If every word is a stop
        word, the lower-cased, whitespace-normalised question is returned
        instead, so the vector store is never queried with an empty string.
    """
    stop_words = {'il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'}
    all_words = question.lower().split()
    words = [w for w in all_words if w not in stop_words]
    return " ".join(words if words else all_words)
 def log_search_results(question, docs_and_scores):
     logging.info(f"Query: {question}")
         logging.info(f"Content: {doc.page_content[:100]}...")
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
     if chat_history is None:
         chat_history = []
     try:
+        # Setup e recupero documenti
         db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+        embeddings = get_embeddings()
         vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
         relevant_docs = get_relevant_documents(vectorstore, question)
         if not relevant_docs:
             return [
                 {"role": "user", "content": question},
+                {"role": "assistant", "content": "Mi dispiace, non ho trovato informazioni rilevanti."}
             ]
+        # Leggi metadata.json per il totale dei chunks
+        metadata_path = os.path.join("db", f"faiss_index_{db_name}", "metadata.json")
+        with open(metadata_path, 'r') as f:
+            metadata_list = json.load(f)
+        # Crea dizionario titolo -> chunks
+        total_chunks = {doc['title']: doc['chunks'] for doc in metadata_list}
+        # Prepara le fonti
         sources = []
+        for doc in relevant_docs:
+            meta = doc.metadata
+            title = meta.get('title', 'Unknown')
+            chunk_index = meta.get('chunk_index', 0)
+            total_doc_chunks = total_chunks.get(title, 0)
+            sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Chunk {chunk_index+1} di {total_doc_chunks}")
+        # Prepara contesto e prompt
+        context = "\n".join([doc.page_content for doc in relevant_docs])
+        sources_text = "\n\nFonti consultate:\n" + "\n".join(set(sources))
         prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
+        prompt += "\nCita sempre le fonti utilizzate nella risposta, inclusi titolo e autore."
+        # Crea messaggio e ottieni risposta
         messages = [
             {"role": "system", "content": prompt},
             *[{"role": m["role"], "content": m["content"]} for m in chat_history],
             {"role": "user", "content": question}
         ]
         client, model = get_llm_client(llm_type)
         response = client.chat.completions.create(
             model=model,
             temperature=0.7,
             max_tokens=2048
         )
         answer = response.choices[0].message.content + sources_text
         return [
             {"role": "user", "content": question},
             {"role": "assistant", "content": answer}
         ]
     except Exception as e:
+        logging.error(f"Error in answer_question: {e}")
+        error_msg = "LLM locale non disponibile." if "local" in str(llm_type) else str(e)
         return [
             {"role": "user", "content": question},
             {"role": "assistant", "content": f"⚠️ {error_msg}"}
         enhanced_query = enhance_query(question)
         docs_and_scores = self.vectorstore.similarity_search_with_score(
             enhanced_query,
+            k=EMBEDDING_CONFIG['k_documents']
         )
         log_search_results(question, docs_and_scores)
+        return [doc for doc, _ in docs_and_scores]
 if __name__ == "__main__":
     pass

db/.DS_Store CHANGED Viewed

Binary files a/db/.DS_Store and b/db/.DS_Store differ

ui/chunks_viewer_tab.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import gradio as gr
+import logging
+import os
+import json
+from langchain.vectorstores import FAISS
+from app.document_handling import get_embeddings
+from app.config import BASE_DB_PATH
+from app.utils.database_handling import list_databases
def create_chunks_viewer_tab():
    """Build the "Visualizza Chunks" tab for browsing the chunks of a database.

    The tab offers a database dropdown, a chunk dropdown filled from the
    database's ``metadata.json``, and a button that shows the text of the
    selected chunk as stored in the FAISS docstore.

    Returns:
        dict: ``{"db_selector": <database dropdown component>}``.
    """

    def list_chunk_labels(db_name):
        """Return one "Chunk N - <title>" label per chunk listed in metadata.json."""
        metadata_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}", "metadata.json")
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        titles = [doc['title'] for doc in metadata for _ in range(doc['chunks'])]
        # Number chunks across the whole database rather than per document:
        # inspect_chunk() resolves N as a position in the docstore, so
        # per-document numbering pointed every document after the first at
        # the wrong chunk (and produced duplicate labels).
        # NOTE(review): assumes docstore insertion order matches the order of
        # entries in metadata.json - confirm against the indexing code.
        return [f"Chunk {position} - {title}" for position, title in enumerate(titles, start=1)]

    def load_chunks(db_name):
        """Return the updated chunk dropdown and a status message for ``db_name``."""
        if not db_name:
            return gr.Dropdown(choices=[], interactive=False), "Seleziona un database"
        try:
            return gr.Dropdown(choices=list_chunk_labels(db_name), interactive=True), ""
        except Exception as e:
            logging.error(f"Errore nel caricamento chunks: {e}")
            return gr.Dropdown(choices=[], interactive=False), f"Errore: {e}"

    def inspect_chunk(db_name, chunk_id):
        """Return the text of the chunk identified by a "Chunk N - <title>" label."""
        if not db_name or not chunk_id:
            return "Seleziona un database e un chunk"
        try:
            db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
            embeddings = get_embeddings()
            # NOTE(review): allow_dangerous_deserialization unpickles the
            # docstore; only acceptable while index files come from this app.
            vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

            # The label format is "Chunk N - <title>"; N is 1-based.
            chunk_num = int(chunk_id.split(" - ")[0].replace("Chunk ", "")) - 1

            stored_docs = list(vectorstore.docstore._dict.values())
            # Reject non-positive numbers too: a negative index would
            # silently wrap around to the end of the list.
            if not 0 <= chunk_num < len(stored_docs):
                return f"Errore: chunk {chunk_num + 1} non trovato nel database"
            return stored_docs[chunk_num].page_content
        except Exception as e:
            logging.error(f"Errore nell'ispezione del chunk: {e}")
            return f"Errore nel recupero del contenuto: {e}"

    # Query the database list once instead of once per use.
    databases = list_databases()
    initial_db = databases[0] if databases else None

    # The change event does not fire for the pre-selected database, so fill
    # the chunk dropdown for it up front.
    initial_labels = []
    initial_ready = False
    if initial_db:
        try:
            initial_labels = list_chunk_labels(initial_db)
            initial_ready = True
        except Exception as e:
            logging.error(f"Errore nel caricamento chunks: {e}")

    with gr.Tab("Visualizza Chunks"):
        gr.Markdown("## Ispeziona Chunks dei Database")

        with gr.Row():
            with gr.Column():
                # Selectors
                db_selector = gr.Dropdown(
                    choices=databases,
                    label="Seleziona Database",
                    value=initial_db
                )
                chunk_selector = gr.Dropdown(
                    choices=initial_labels,
                    label="Seleziona Chunk",
                    interactive=initial_ready
                )
                inspect_button = gr.Button("Visualizza Contenuto")

            with gr.Column():
                # Content display area
                chunk_content = gr.TextArea(
                    label="Contenuto del Chunk",
                    interactive=False,
                    lines=20
                )
                error_box = gr.Textbox(
                    label="Status",
                    visible=True,
                    interactive=False
                )

        # Events
        db_selector.change(
            fn=load_chunks,
            inputs=[db_selector],
            outputs=[chunk_selector, error_box]
        )
        inspect_button.click(
            fn=inspect_chunk,
            inputs=[db_selector, chunk_selector],
            outputs=[chunk_content]
        )

    return {"db_selector": db_selector}

ui/db_management_tab.py CHANGED Viewed

@@ -9,6 +9,8 @@ def create_db_management_tab(dropdowns):
         # Aggiorna tutti i dropdown dell'applicazione (5 invece di 4)
         return [gr.update(choices=updated_dbs) for _ in range(5)]
     with gr.Tab("Gestione Database"):
         gr.Markdown("## Operazioni sui Database")
@@ -86,4 +88,3 @@ def create_db_management_tab(dropdowns):
     # Ritorna i componenti che vogliamo poter aggiornare/agganciare
     return [modify_db_old_name, delete_db_dropdown, create_db_button, modify_db_button, delete_db_button]

         # Aggiorna tutti i dropdown dell'applicazione (5 invece di 4)
         return [gr.update(choices=updated_dbs) for _ in range(5)]
     with gr.Tab("Gestione Database"):
         gr.Markdown("## Operazioni sui Database")
     # Ritorna i componenti che vogliamo poter aggiornare/agganciare
     return [modify_db_old_name, delete_db_dropdown, create_db_button, modify_db_button, delete_db_button]

ui/document_management_tab.py CHANGED Viewed

@@ -16,14 +16,16 @@ def create_document_management_tab():
         ]
     def upload_and_index_callback(files, title, author, db_name):
-        """Carica e indicizza i documenti, quindi aggiorna la lista dei file."""
         try:
-            status = upload_and_index(files, title, author, db_name)
-            logging.info(f"Upload completato: {status}")
-            return status, update_dropdowns()[0], update_dropdowns()[1]
         except Exception as e:
-            logging.error(f"Errore durante l'upload: {str(e)}")
-            return f"Errore: {str(e)}"
     def list_files_callback(db_name):
         """Elenca i file indicizzati nel database conoscenze specificato."""

         ]
     def upload_and_index_callback(files, title, author, db_name):
         """Index the uploaded files and return the status plus refreshed database lists.

         Args:
             files: Uploaded file objects forwarded to ``upload_and_index``.
             title: Title applied to the uploaded documents.
             author: Author applied to the uploaded documents.
             db_name: Name of the target database.

         Returns:
             A 3-tuple ``(message, databases, databases)``: the status or error
             message, followed by the current database list twice (one per
             output component).
         """
         try:
             # Success and failure produce the same outputs, so only the
             # message of the (success, message, details) result is needed.
             _success, message, _details = upload_and_index(files, title, author, db_name)
         except Exception as e:
             message = f"Errore durante l'upload: {e}"
             logging.error(message)
         # Look the databases up once and reuse the result for both outputs.
         databases = list_databases()
         return message, databases, databases
     def list_files_callback(db_name):
         """Elenca i file indicizzati nel database conoscenze specificato."""