Modularization of the program

Every feature now has its own file in its own folder; the program still needs further refinement. A short usage sketch of the new layout follows the file list below.
- app.py +1 -3
- app/config.py +48 -1
- app/document_handling.py +21 -109
- app/functions/database_handling.py +1 -3
- app/llm_handling.py +92 -198
- app/llm_handling2.py +188 -0
- {utils → app/utils}/__init__.py +0 -0
- app/utils/dataclass_utils.py +59 -0
- app/utils/embedding_utils.py +12 -0
- {utils → app/utils}/helpers.py +0 -0
- app/utils/markdowns_utils.py +14 -0
- app/utils/voice_utils.py +71 -0
- db/.DS_Store +0 -0
- db/faiss_index/index.faiss +0 -0
- db/faiss_index/index.pkl +0 -3
- db/faiss_index_Daniele2/.DS_Store +0 -0
- db/faiss_index_Daniele2/index.faiss +0 -0
- db/faiss_index_Daniele2/index.pkl +0 -3
- db/faiss_index_Daniele2/metadata.json +0 -9
- db/faiss_index_Orienta/index.faiss +0 -3
- db/faiss_index_Orienta/index.pkl +0 -3
- db/faiss_index_Orienta/metadata.json +0 -9
- db/faiss_index_default_db/index.faiss +0 -0
- db/faiss_index_default_db/index.pkl +0 -3
- ui/chatbot_tab.py +38 -37
- ui/new_features_tab.py +0 -43
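
The refactor centralizes configuration in app/config.py and moves shared helpers under app/utils. As a rough usage sketch of how the refactored modules fit together after this commit (the question text and database name are invented for illustration; module paths come from the file list above):

# Usage sketch, assuming the package layout introduced by this commit.
from app.config import LLMType
from app.llm_handling import answer_question

# db_name maps to the folder db/faiss_index_<db_name>.
history = answer_question(
    "Quali sono gli obiettivi di apprendimento?",  # invented example question
    db_name="default_db",
    llm_type=LLMType.OPENAI_GPT_4O_MINI,
)
for msg in history:
    print(msg["role"], ":", msg["content"][:80])
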
app.py
CHANGED
@@ -7,7 +7,6 @@ from app.functions.database_handling import list_databases
 from ui.chatbot_tab import create_chatbot_tab
 from ui.db_management_tab import create_db_management_tab
 from ui.document_management_tab import create_document_management_tab
-from ui.new_features_tab import create_new_features_tab
 from ui.info_tab import create_info_tab  # Importa la nuova tab
 
 # Configura il logging
@@ -39,8 +38,7 @@ def main():
         chat_refs  # Tab 4: Chatbot (ultima tab)
         doc_refs  # Tab 2: Document Management
         db_refs(dropdowns)
-
-        create_new_features_tab()  # Tab 3: Features
+
         info_refs  # Tab 5: Info (ultima tab)
 
     rag_chatbot.launch()

app/config.py
CHANGED
@@ -1,5 +1,10 @@
 import os
 from dotenv import load_dotenv
+from enum import Enum
+from openai import OpenAI
+from pathlib import Path
+
+
 
 # Carica le variabili d'ambiente dal file .env
 load_dotenv()
@@ -7,4 +12,46 @@ load_dotenv()
 # Configurazione del modello
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    raise ValueError("OPENAI_API_KEY non trovata. Verifica il file .env")
+    raise ValueError("OPENAI_API_KEY non trovata. Verifica il file .env")
+
+class LLMType(Enum):
+    OPENAI_GPT_4O_MINI = "openai - GPT-4o-mini"
+    LOCAL_QWEN = "local - Qwen 7B"
+    LOCAL_PHI = "local - Phi-3 Mini"
+
+# Configurazione modelli
+LLM_CONFIGS = {
+    LLMType.OPENAI_GPT_4O_MINI: {
+        "client": lambda: OpenAI(api_key=OPENAI_API_KEY),
+        "model": "gpt-4-mini",
+        "base_url": None
+    },
+    LLMType.LOCAL_QWEN: {
+        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
+        "model": "qwen2.5-coder-7b-instruct",
+        "base_url": "http://192.168.82.5:1234/v1"
+    },
+    LLMType.LOCAL_PHI: {
+        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
+        "model": "phi-3.5-mini-ita",
+        "base_url": "http://192.168.82.5:1234/v1"
+    }
+}
+
+EMBEDDING_CONFIG = {
+    "model_name": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    "chunk_size": 2000,
+    "chunk_overlap": 100,
+    "k_documents": 5,
+    "min_similarity": 0.7
+}
+
+# Aggiungi questa costante
+EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+
+# Definisci il percorso base per i database
+BASE_DB_PATH = "db"
+
+# Voci italiane edge-tts
+VOICE_USER = "it-IT-DiegoNeural"  # Voce maschile utente
+VOICE_ASSISTANT = "it-IT-ElsaNeural"  # Voce femminile assistente

app/document_handling.py
CHANGED
@@ -1,79 +1,20 @@
 import logging
 import gradio as gr
 from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
 import os
-import shutil
 import PyPDF2
 from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from dataclasses import dataclass
 import json
 from datetime import datetime
 from app.functions.database_handling import BASE_DB_PATH
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
+from app.utils.embedding_utils import get_embeddings
+from app.utils.dataclass_utils import DocumentMetadata, save_metadata
 
-# Initialize the text splitter
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
 
 # -------------- UTILITY FUNCTIONS --------------
-@dataclass
-class DocumentMetadata:
-    """
-    Classe per gestire i metadati dei documenti.
-
-    Attributi:
-        filename (str): Nome del file originale
-        title (str): Titolo assegnato al documento
-        author (str): Autore del documento
-        upload_date (str): Data di caricamento
-        chunks (int): Numero di chunks in cui è stato diviso il documento
-    """
-    filename: str
-    title: str
-    author: str
-    upload_date: str
-    chunks: int
-
-    def to_dict(self):
-        """Converte i metadati in un dizionario per il salvataggio JSON."""
-        return {
-            "filename": self.filename,
-            "title": self.title,
-            "author": self.author,
-            "upload_date": self.upload_date,
-            "chunks": self.chunks
-        }
-
-def save_metadata(metadata_list, db_name):
-    """
-    Salva i metadati dei documenti nel database specificato.
-
-    Args:
-        metadata_list: Lista di oggetti DocumentMetadata da salvare
-        db_name: Nome del database in cui salvare i metadati
-
-    Note:
-        I metadati vengono salvati in un file JSON nella directory del database
-    """
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-    metadata_file = os.path.join(db_path, "metadata.json")
-
-    # Crea la directory se non esiste
-    if not os.path.exists(db_path):
-        os.makedirs(db_path)
-
-    # Carica metadati esistenti se presenti
-    existing_metadata = []
-    if os.path.exists(metadata_file):
-        with open(metadata_file, 'r') as f:
-            existing_metadata = json.load(f)
-
-    # Aggiungi nuovi metadati
-    existing_metadata.extend([m.to_dict() for m in metadata_list])
-
-    # Salva il file aggiornato
-    with open(metadata_file, 'w') as f:
-        json.dump(existing_metadata, f, indent=2)
 
 def extract_text_from_pdf(file_path):
     """
@@ -108,34 +49,26 @@ def extract_text_from_docx(file_path):
         text += para.text + "\n"
     return text
 
-
-
-
-
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-    if not os.path.exists(db_path):
-        logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
-        return "Database non trovato."
-
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-
-    # Perform a similarity search
-    docs = vectorstore.similarity_search(question)
-
-    if not docs:
-        return "Nessun documento corrispondente alla query."
-
-    # Collect the document contents
-    results = [doc.page_content for doc in docs]
-    return "\n\n".join(results)
-
+def create_chunks(text):
+    from app.config import EMBEDDING_CONFIG
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=EMBEDDING_CONFIG["chunk_size"],
+        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    )
+    return text_splitter.split_text(text)
 
 
+def create_vectorstore(texts, metadatas, db_path):
+    embeddings = get_embeddings()
+    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
 
 
 # -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
+
 def upload_and_index(files, title, author, db_name="default_db"):
     if not files:
         return "Nessun file caricato."
@@ -154,7 +87,7 @@ def upload_and_index(files, title, author, db_name="default_db"):
         with open(file.name, 'r', encoding='utf-8') as f:
             text = f.read()
 
-        chunks = text_splitter.split_text(text)
+        chunks = create_chunks(text)
 
         # Metadata per il documento
         doc_meta = DocumentMetadata(
@@ -189,7 +122,8 @@ def upload_and_index(files, title, author, db_name="default_db"):
     db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
     os.makedirs(db_path, exist_ok=True)
 
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    # Usa la funzione centralizzata invece dell'inizializzazione diretta
+    embeddings = get_embeddings()
     texts = [doc["content"] for doc in documents]
     metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
 
@@ -265,26 +199,4 @@ def delete_file_from_database(file_name, db_name="default_db"):
     except Exception as e:
        return f"Errore durante la rimozione del file: {e}"
 
-# -------------- NEW FEATURES TAB FUNCTIONS --------------
-def search_documents(query, db_name="default_db"):
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
-    if not os.path.exists(db_path):
-        logging.warning(f"L'indice FAISS per il database '{db_name}' non esiste.")
-        return "Database non trovato."
-
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-
-    # Perform a similarity search
-    docs = vectorstore.similarity_search(query)
-
-    if not docs:
-        return "Nessun documento corrispondente alla query."
-
-    # Collect the document contents
-    results = [doc.page_content for doc in docs]
-    return "\n\n".join(results)
 
-def generate_summary(db_name="default_db"):
-    # Placeholder per la logica di summarization
-    return "This is a summary of the documents in the database."

app/functions/database_handling.py
CHANGED
@@ -3,9 +3,7 @@ import os
 import shutil
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
-
-# Definisci il percorso base per i database
-BASE_DB_PATH = "db"
+from app.config import BASE_DB_PATH
 
 # Crea la cartella db se non esiste
 if not os.path.exists(BASE_DB_PATH):

app/llm_handling.py
CHANGED
@@ -1,142 +1,39 @@
+# llm_handling.py
 import logging
 import os
-import shutil
-from enum import Enum
-
-from openai import OpenAI
 from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-import gradio as gr
-import asyncio
-import edge_tts
-from pathlib import Path
 import requests
 from tenacity import retry, stop_after_attempt, wait_exponential
+import json
 
-from app.config import OPENAI_API_KEY
-from app.functions.database_handling import BASE_DB_PATH
+from app.config import BASE_DB_PATH  # Ensure correct import
+from app.config import LLM_CONFIGS, LLMType  # Import LLMType and LLM_CONFIGS
 from app.configs.prompts import SYSTEM_PROMPTS
-
-import json
+from app.utils.embedding_utils import get_embeddings
+from app.utils.voice_utils import generate_speech  # Retain import if needed
 
 logging.basicConfig(level=logging.INFO)
-local_ip="192.168.82.5:1234"
-
-class LLMType(Enum):
-    OPENAI_GPT_4O_MINI = "openai - GPT-4o-mini"
-    LOCAL_QWEN = "local - Qwen 7B"
-    LOCAL_PHI = "local - Phi-3 Mini"
-
-# Configurazione modelli
-LLM_CONFIGS = {
-    LLMType.OPENAI_GPT_4O_MINI: {
-        "client": lambda: OpenAI(api_key=OPENAI_API_KEY),
-        "model": "gpt-4-mini",
-        "base_url": None
-    },
-    LLMType.LOCAL_QWEN: {
-        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
-        "model": "qwen2.5-coder-7b-instruct",
-        "base_url": "http://192.168.82.5:1234/v1"
-    },
-    LLMType.LOCAL_PHI: {
-        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
-        "model": "phi-3.5-mini-ita",
-        "base_url": "http://192.168.82.5:1234/v1"
-    }
-}
 
+# =====================================
+# Functions related to LLM
+# =====================================
+
 def get_llm_client(llm_type: LLMType):
-    """Ottiene il client appropriato per il modello selezionato"""
+    """Obtains the appropriate client for the selected model"""
     config = LLM_CONFIGS.get(llm_type)
     if not config:
-        raise ValueError(f"Modello {llm_type} non supportato")
-    return config["client"](), config["model"]
-
-# Voci italiane edge-tts
-VOICE_USER = "it-IT-DiegoNeural"  # Voce maschile utente
-VOICE_ASSISTANT = "it-IT-ElsaNeural"  # Voce femminile assistente
-
-async def text_to_speech(text, voice_name, output_file):
-    """Genera audio usando edge-tts"""
-    communicate = edge_tts.Communicate(text, voice_name)
-    await communicate.save(output_file)
-
-def generate_speech(text, is_user=True):
-    try:
-        # Crea directory per audio temporanei
-        audio_dir = Path("temp_audio")
-        audio_dir.mkdir(exist_ok=True)
-
-        # Seleziona voce e genera nome file
-        voice = VOICE_USER if is_user else VOICE_ASSISTANT
-        file_name = f"speech_{hash(text)}.mp3"
-        output_path = audio_dir / file_name
-
-        # Genera audio
-        asyncio.run(text_to_speech(text, voice, str(output_path)))
-        return str(output_path)
-
-    except Exception as e:
-        logging.error(f"Errore TTS: {e}")
-        return None
-
-import re
-
-def clean_markdown(text):
-    """Rimuove markdown dal testo"""
-    text = re.sub(r'```[\s\S]*?```', '', text)  # blocchi codice
-    text = re.sub(r'`.*?`', '', text)  # codice inline
-    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # link
-    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # bold
-    text = re.sub(r'\*(.*?)\*', r'\1', text)  # italic
-    return text.strip()
-
-def generate_chat_audio(chat_history):
-    """Genera audio della conversazione con voci alternate"""
-    try:
-        audio_files = []
-        audio_dir = Path("temp_audio")
-        audio_dir.mkdir(exist_ok=True)
-
-        # Genera audio per ogni messaggio
-        for msg in chat_history:
-            content = clean_markdown(msg["content"])
-            if not content.strip():
-                continue
-
-            voice = VOICE_USER if msg["role"] == "user" else VOICE_ASSISTANT
-            file_name = f"chat_{msg['role']}_{hash(content)}.mp3"
-            output_path = audio_dir / file_name
-
-            # Genera audio senza prefissi
-            asyncio.run(text_to_speech(content, voice, str(output_path)))
-            audio_files.append(str(output_path))
-
-        # Combina tutti gli audio
-        if audio_files:
-            from pydub import AudioSegment
-            combined = AudioSegment.empty()
-            for audio_file in audio_files:
-                segment = AudioSegment.from_mp3(audio_file)
-                combined += segment
-
-            final_path = audio_dir / f"chat_complete_{hash(str(chat_history))}.mp3"
-            combined.export(str(final_path), format="mp3")
-            return str(final_path)
-
-        return None
-
-    except Exception as e:
-        logging.error(f"Errore generazione audio: {e}")
-        return None
+        raise ValueError(f"Model {llm_type} not supported")
+    client_class = config["client"]
+    model = config["model"]
+    client = client_class()  # Ensure no arguments are needed
+    return client, model
 
 def get_system_prompt(prompt_type="tutor"):
-    """Seleziona il prompt di sistema appropriato"""
+    """Selects the appropriate system prompt"""
     return SYSTEM_PROMPTS.get(prompt_type, SYSTEM_PROMPTS["tutor"])
 
 def test_local_connection():
-    """
+    """Checks connection to the local LLM server"""
     try:
         response = requests.get(f"http://192.168.82.5:1234/v1/health", timeout=5)
         return response.status_code == 200
@@ -150,111 +47,108 @@ def read_metadata(db_path):
         return json.load(f)
     return []
 
-
-def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
+def get_relevant_documents(vectorstore, question, min_similarity=0.7):
+    """Retrieves relevant documents from the vectorstore"""
+    try:
+        enhanced_query = enhance_query(question)
+        docs_and_scores = vectorstore.similarity_search_with_score(
+            enhanced_query,
+            k=8
+        )
+        filtered_docs = [
+            doc for doc, score in docs_and_scores if score >= min_similarity
+        ]
+        logging.info(f"Query: {question}")
+        logging.info(f"Documents found: {len(filtered_docs)}")
+        return filtered_docs[:5] if filtered_docs else []
+    except Exception as e:
+        logging.error(f"Error retrieving documents: {e}")
+        return []
+
+def enhance_query(question):
+    stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
+    words = [w for w in question.lower().split() if w not in stop_words]
+    enhanced_query = " ".join(words)
+    return enhanced_query
+
+def log_search_results(question, docs_and_scores):
+    logging.info(f"Query: {question}")
+    for idx, (doc, score) in enumerate(docs_and_scores, 1):
+        logging.info(f"Doc {idx} - Score: {score:.4f}")
+        logging.info(f"Content: {doc.page_content[:100]}...")
+
+@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=None):
     if chat_history is None:
         chat_history = []
-
-    logging.info(f"Inizio elaborazione domanda: {question} per database: {db_name}")
-
     try:
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-
-        if not os.path.exists(db_path):
-            logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
+        embeddings = get_embeddings()
+        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+        metadata_list = read_metadata(db_path)
+        metadata_dict = {m["filename"]: m for m in metadata_list}
+        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
+        relevant_docs = get_relevant_documents(vectorstore, question)
+        if not relevant_docs:
             return [
                 {"role": "user", "content": question},
-                {"role": "assistant", "content": "Database non trovato."}
+                {"role": "assistant", "content": "Sorry, no relevant information found to answer your question. Try rephrasing or asking a different question."}
             ]
-
-        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-
-        # Cerca i documenti (chunk) più simili
-        relevant_docs = vectorstore.similarity_search(question, k=5)
-
-        metadata_list = read_metadata(db_path)
-        metadata_dict = {m["filename"]: m for m in metadata_list}
-
-        # Logga i chunk recuperati con metadata
-        for idx, doc in enumerate(relevant_docs):
-            logging.info(f"--- Chunk {idx+1} ---")
+        sources = []
+        for idx, doc in enumerate(relevant_docs, 1):
             source_file = doc.metadata.get("source", "Unknown")
-
-            # Recupera i metadata dal file json
             if source_file in metadata_dict:
-
-
-
-        # Prepara il contesto dai documenti
-        context = "\n".join([doc.page_content for doc in relevant_docs])
+                meta = metadata_dict[source_file]
+                sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Part {idx} of {len(relevant_docs)}")
+        context = "\n".join([
+            f"[Part {idx+1} of {len(relevant_docs)}]\n{doc.page_content}"
+            for idx, doc in enumerate(relevant_docs)
+        ])
+        sources_text = "\n\nSources consulted:\n" + "\n".join(set(sources))
         prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
-
-        # Prepara la cronologia completa delle conversazioni
-        conversation_history = []
-        for msg in chat_history:  # Rimuovo limite di 4 messaggi
-            conversation_history.append({
-                "role": msg["role"],
-                "content": msg["content"]
-            })
-
-        # Costruisci messaggio con contesto completo
+        prompt += "\nAlways cite the sources used for your response, including the document title and author."
         messages = [
            {"role": "system", "content": prompt},
-            *conversation_history,
+            *[{"role": m["role"], "content": m["content"]} for m in chat_history],
            {"role": "user", "content": question}
        ]
-
-        if "local" in str(llm_type):
-            if not test_local_connection():
-                raise ConnectionError("LM Studio non raggiungibile")
-
        client, model = get_llm_client(llm_type)
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7,
-            max_tokens=2048
+            max_tokens=2048
        )
-        answer = response.choices[0].message.content
-
-        # Genera audio per domanda e risposta
-        user_audio = generate_speech(question, is_user=True)
-        assistant_audio = generate_speech(answer, is_user=False)
-
+        answer = response.choices[0].message.content + sources_text
        return [
-            {"role": "user", "content": question, "audio": user_audio},
-            {"role": "assistant", "content": answer, "audio": assistant_audio}
+            {"role": "user", "content": question},
+            {"role": "assistant", "content": answer}
        ]
-
    except Exception as e:
-        logging.error(f"Errore durante la generazione della risposta: {e}")
-        error_msg = "LLM locale non disponibile. Riprova più tardi o usa OpenAI." if "local" in str(llm_type) else str(e)
+        logging.error(f"Error generating response: {e}")
+        error_msg = "Local LLM not available. Try again later or use OpenAI." if "local" in str(llm_type) else str(e)
        return [
            {"role": "user", "content": question},
            {"role": "assistant", "content": f"⚠️ {error_msg}"}
        ]
 
+class DocumentRetriever:
+    def __init__(self, db_path):
+        self.embeddings = get_embeddings()
+        self.vectorstore = FAISS.load_local(
+            db_path,
+            self.embeddings,
+            allow_dangerous_deserialization=True
+        )
+
+    def get_relevant_chunks(self, question):
+        enhanced_query = enhance_query(question)
+        docs_and_scores = self.vectorstore.similarity_search_with_score(
+            enhanced_query,
+            k=8
+        )
+        log_search_results(question, docs_and_scores)
+        # Implement _filter_relevant_docs or remove the call
+        # return self._filter_relevant_docs(docs_and_scores)
 
 if __name__ == "__main__":
-
-    pass
+    pass

app/llm_handling2.py
ADDED
# llm_handling.py
import logging
import os
from langchain_community.vectorstores import FAISS
import requests
from tenacity import retry, stop_after_attempt, wait_exponential
import json

from app.config import *
from app.configs.prompts import SYSTEM_PROMPTS
from app.utils.embedding_utils import get_embeddings
from app.utils.voice_utils import generate_speech

logging.basicConfig(level=logging.INFO)

# =====================================
# Funzioni relative al LLM
# =====================================

def get_llm_client(llm_type: LLMType):
    """Ottiene il client appropriato per il modello selezionato"""
    config = LLM_CONFIGS.get(llm_type)
    if not config:
        raise ValueError(f"Modello {llm_type} non supportato")
    return config["client"](), config["model"]

def get_system_prompt(prompt_type="tutor"):
    """Seleziona il prompt di sistema appropriato"""
    return SYSTEM_PROMPTS.get(prompt_type, SYSTEM_PROMPTS["tutor"])

def test_local_connection():
    """Verifica la connessione al server LLM locale"""
    try:
        response = requests.get(f"http://192.168.82.5:1234/v1/health", timeout=5)
        return response.status_code == 200
    except:
        return False

def read_metadata(db_path):
    metadata_file = os.path.join(db_path, "metadata.json")
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            return json.load(f)
    return []

def get_relevant_documents(vectorstore, question, min_similarity=0.7):
    """Recupera i documenti rilevanti dal vectorstore"""
    try:
        # Migliora la query prima della ricerca
        enhanced_query = enhance_query(question)

        # Ottieni documenti con punteggi di similarità
        docs_and_scores = vectorstore.similarity_search_with_score(
            enhanced_query,
            k=8  # Aumenta il numero di documenti recuperati
        )

        # Filtra i documenti per similarità
        filtered_docs = [
            doc for doc, score in docs_and_scores
            if score >= min_similarity
        ]

        # Log dei risultati per debug
        logging.info(f"Query: {question}")
        logging.info(f"Documenti trovati: {len(filtered_docs)}")

        # Restituisci almeno un documento o una lista vuota
        return filtered_docs[:5] if filtered_docs else []

    except Exception as e:
        logging.error(f"Errore nel recupero dei documenti: {e}")
        return []  # Restituisce lista vuota invece di None

def enhance_query(question):
    # Rimuovi parole non significative
    stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
    words = [w for w in question.lower().split() if w not in stop_words]

    # Estrai keywords chiave
    enhanced_query = " ".join(words)
    return enhanced_query

def log_search_results(question, docs_and_scores):
    logging.info(f"Query: {question}")
    for idx, (doc, score) in enumerate(docs_and_scores, 1):
        logging.info(f"Doc {idx} - Score: {score:.4f}")
        logging.info(f"Content: {doc.page_content[:100]}...")

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
    if chat_history is None:
        chat_history = []

    try:
        embeddings = get_embeddings()  # Usa la funzione comune
        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")

        # Leggi i metadati
        metadata_list = read_metadata(db_path)
        metadata_dict = {m["filename"]: m for m in metadata_list}

        # Recupera i documenti rilevanti
        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
        relevant_docs = get_relevant_documents(vectorstore, question)

        if not relevant_docs:
            return [
                {"role": "user", "content": question},
                {"role": "assistant", "content": "Mi dispiace, non ho trovato informazioni rilevanti per rispondere alla tua domanda. Prova a riformularla o a fare una domanda diversa."}
            ]

        # Prepara le citazioni delle fonti con numerazione dei chunk
        sources = []
        for idx, doc in enumerate(relevant_docs, 1):
            source_file = doc.metadata.get("source", "Unknown")
            if source_file in metadata_dict:
                meta = metadata_dict[source_file]
                sources.append(f"📚 {meta['title']} (Autore: {meta['author']}) - Parte {idx} di {len(relevant_docs)}")

        # Prepara il contesto con le fonti
        context = "\n".join([
            f"[Parte {idx+1} di {len(relevant_docs)}]\n{doc.page_content}"
            for idx, doc in enumerate(relevant_docs)
        ])
        sources_text = "\n\nFonti consultate:\n" + "\n".join(set(sources))

        # Aggiorna il prompt per includere la richiesta di citare le fonti
        prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
        prompt += "\nCita sempre le fonti utilizzate per la tua risposta includendo il titolo del documento e l'autore."

        # Costruisci il messaggio completo
        messages = [
            {"role": "system", "content": prompt},
            *[{"role": m["role"], "content": m["content"]} for m in chat_history],
            {"role": "user", "content": question}
        ]

        # Ottieni la risposta dall'LLM
        client, model = get_llm_client(llm_type)
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7,
            max_tokens=2048
        )
        answer = response.choices[0].message.content + sources_text

        # return [
        #     {"role": "user", "content": question, "audio": user_audio},
        #     {"role": "assistant", "content": answer, "audio": assistant_audio}
        # ]

    except Exception as e:
        logging.error(f"Errore durante la generazione della risposta: {e}")
        error_msg = "LLM locale non disponibile. Riprova più tardi o usa OpenAI." if "local" in str(llm_type) else str(e)
        return [
            {"role": "user", "content": question},
            {"role": "assistant", "content": f"⚠️ {error_msg}"}
        ]

class DocumentRetriever:
    def __init__(self, db_path):
        self.embeddings = get_embeddings()
        self.vectorstore = FAISS.load_local(
            db_path,
            self.embeddings,
            allow_dangerous_deserialization=True
        )

    def get_relevant_chunks(self, question):
        enhanced_query = enhance_query(question)
        docs_and_scores = self.vectorstore.similarity_search_with_score(
            enhanced_query,
            k=8
        )

        log_search_results(question, docs_and_scores)
        return self._filter_relevant_docs(docs_and_scores)


if __name__ == "__main__":
    pass
{utils → app/utils}/__init__.py
RENAMED
File without changes

app/utils/dataclass_utils.py
ADDED
import os
import json
from dataclasses import dataclass
from app.functions.database_handling import BASE_DB_PATH

@dataclass
class DocumentMetadata:
    """
    Classe per gestire i metadati dei documenti.

    Attributi:
        filename (str): Nome del file originale
        title (str): Titolo assegnato al documento
        author (str): Autore del documento
        upload_date (str): Data di caricamento
        chunks (int): Numero di chunks in cui è stato diviso il documento
    """
    filename: str
    title: str
    author: str
    upload_date: str
    chunks: int

    def to_dict(self):
        """Converte i metadati in un dizionario per il salvataggio JSON."""
        return {
            "filename": self.filename,
            "title": self.title,
            "author": self.author,
            "upload_date": self.upload_date,
            "chunks": self.chunks
        }

def save_metadata(metadata_list, db_name):
    """
    Salva i metadati dei documenti nel database specificato.

    Args:
        metadata_list: Lista di oggetti DocumentMetadata da salvare
        db_name: Nome del database in cui salvare i metadati

    Note:
        I metadati vengono salvati in un file JSON nella directory del database
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(db_path):
        os.makedirs(db_path)

    existing_metadata = []
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            existing_metadata = json.load(f)

    existing_metadata.extend([m.to_dict() for m in metadata_list])

    with open(metadata_file, 'w') as f:
        json.dump(existing_metadata, f, indent=2)

app/utils/embedding_utils.py
ADDED
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from app.config import EMBEDDING_CONFIG

def get_embeddings():
    """Inizializza gli embeddings usando il modello configurato"""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_CONFIG["model_name"],
        model_kwargs={'device': device}
    )
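
get_embeddings() is now the single place where the embedding model is instantiated, so indexes are always built and queried with the same model. A small usage sketch (the sample text and the faiss_index_demo path are invented for illustration):

# Usage sketch: build, save and reload a FAISS index with the shared helper.
from langchain_community.vectorstores import FAISS
from app.utils.embedding_utils import get_embeddings

embeddings = get_embeddings()
db = FAISS.from_texts(["esempio di testo"], embeddings, metadatas=[{"source": "demo.txt"}])
db.save_local("db/faiss_index_demo")
reloaded = FAISS.load_local("db/faiss_index_demo", embeddings, allow_dangerous_deserialization=True)
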
{utils → app/utils}/helpers.py
RENAMED
File without changes

app/utils/markdowns_utils.py
ADDED
import re

# =====================================
# Funzioni relative al Markdown
# =====================================

def clean_markdown(text):
    """Rimuove markdown dal testo"""
    text = re.sub(r'```[\s\S]*?```', '', text)  # blocchi codice
    text = re.sub(r'`.*?`', '', text)  # codice inline
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # link
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # bold
    text = re.sub(r'\*(.*?)\*', r'\1', text)  # italic
    return text.strip()

app/utils/voice_utils.py
ADDED
import logging
import asyncio
import edge_tts
from app.config import VOICE_USER, VOICE_ASSISTANT
from pathlib import Path
from app.utils.markdowns_utils import clean_markdown


async def text_to_speech(text, voice_name, output_file):
    """Genera audio usando edge-tts"""
    communicate = edge_tts.Communicate(text, voice_name)
    await communicate.save(output_file)

def generate_speech(text, is_user=True):
    try:
        # Crea directory per audio temporanei
        audio_dir = Path("temp_audio")
        audio_dir.mkdir(exist_ok=True)

        # Seleziona voce e genera nome file
        voice = VOICE_USER if is_user else VOICE_ASSISTANT
        file_name = f"speech_{hash(text)}.mp3"
        output_path = audio_dir / file_name

        # Genera audio
        asyncio.run(text_to_speech(text, voice, str(output_path)))
        return str(output_path)

    except Exception as e:
        logging.error(f"Errore TTS: {e}")
        return None

def generate_chat_audio(chat_history):
    """Genera audio della conversazione con voci alternate"""
    try:
        audio_files = []
        audio_dir = Path("temp_audio")
        audio_dir.mkdir(exist_ok=True)

        # Genera audio per ogni messaggio
        for msg in chat_history:
            content = clean_markdown(msg["content"])
            if not content.strip():
                continue

            voice = VOICE_USER if msg["role"] == "user" else VOICE_ASSISTANT
            file_name = f"chat_{msg['role']}_{hash(content)}.mp3"
            output_path = audio_dir / file_name

            # Genera audio senza prefissi
            asyncio.run(text_to_speech(content, voice, str(output_path)))
            audio_files.append(str(output_path))

        # Combina tutti gli audio
        if audio_files:
            from pydub import AudioSegment

            combined = AudioSegment.empty()
            for audio_file in audio_files:
                segment = AudioSegment.from_mp3(audio_file)
                combined += segment

            final_path = audio_dir / f"chat_complete_{hash(str(chat_history))}.mp3"
            combined.export(str(final_path), format="mp3")
            return str(final_path)

        return None

    except Exception as e:
        logging.error(f"Errore generazione audio: {e}")
        return None

db/.DS_Store
CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ

db/faiss_index/index.faiss
DELETED
Binary file (1.58 kB)

db/faiss_index/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:407d95e0808ddf251e3fb442241edd72c47961f5a38d5546021ef205b9fdeb57
-size 960117

db/faiss_index_Daniele2/.DS_Store
DELETED
Binary file (6.15 kB)

db/faiss_index_Daniele2/index.faiss
DELETED
Binary file (3.12 kB)

db/faiss_index_Daniele2/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e79bcca55b5153ea71218a3d2204c01ec1eccf59162fd4547d19956a4750d04e
-size 2958

db/faiss_index_Daniele2/metadata.json
DELETED
@@ -1,9 +0,0 @@
-[
-  {
-    "filename": "istruzioni obiettivi di apprendimento.pdf",
-    "title": "Obiettivi di apprendimento",
-    "author": "Daniele",
-    "upload_date": "2025-01-02 15:14:19",
-    "chunks": 2
-  }
-]

db/faiss_index_Orienta/index.faiss
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d4ffbc57fcbef553e507d44c7708b4e23b947f5af97c13d97359f3d814fc562a
-size 2362413

db/faiss_index_Orienta/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a89e4e492e28b30ef9ede1fd176dfcdac5e973884a56ab9d217a695136be8349
-size 3303433

db/faiss_index_Orienta/metadata.json
DELETED
@@ -1,9 +0,0 @@
-[
-  {
-    "filename": "Imparare a dirigere se stessi.pdf",
-    "title": "Imparare a dirigere se stessi ",
-    "author": "Pellerey",
-    "upload_date": "2025-01-02 22:47:28",
-    "chunks": 1538
-  }
-]

db/faiss_index_default_db/index.faiss
DELETED
Binary file (309 kB)

db/faiss_index_default_db/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4c797df1c4a8ddac75b4b083391220179ce5bbcd2b962b4dfbc7d960628cd0b2
-size 107706

ui/chatbot_tab.py
CHANGED
@@ -4,12 +4,13 @@ import logging
 import gradio as gr
 from app.functions.database_handling import list_databases
 from app.configs.prompts import SYSTEM_PROMPTS
-from app.llm_handling import answer_question, LLMType
-from utils.helpers import extract_text_from_files
-
-
+from app.llm_handling import answer_question, LLMType
+from app.utils.helpers import extract_text_from_files
+from app.utils.voice_utils import *
+from app.utils.markdowns_utils import clean_markdown
 
 
+logging.basicConfig(level=logging.INFO)
 
 
 def create_chatbot_tab():
@@ -87,26 +88,26 @@ def create_chatbot_tab():
 
         return str(Path(temp_path).absolute())
 
-    def download_audio(chat_history):
-        """Scarica l'ultimo messaggio audio dalla chat"""
-        try:
-            if not chat_history:
-                gr.Warning("Nessun messaggio nella chat")
-                return None
+    # def download_audio(chat_history):
+    #     """Scarica l'ultimo messaggio audio dalla chat"""
+    #     try:
+    #         if not chat_history:
+    #             gr.Warning("Nessun messaggio nella chat")
+    #             return None
 
-            # Prendi l'ultimo messaggio assistant
-            for msg in reversed(chat_history):
-                if msg["role"] == "assistant" and "audio" in msg:
-                    audio_path = msg["audio"]
-                    if audio_path and os.path.exists(audio_path):
-                        return audio_path
+    #         # Prendi l'ultimo messaggio assistant
+    #         for msg in reversed(chat_history):
+    #             if msg["role"] == "assistant" and "audio" in msg:
+    #                 audio_path = msg["audio"]
+    #                 if audio_path and os.path.exists(audio_path):
+    #                     return audio_path
 
-            gr.Warning("Nessun audio disponibile per l'ultima risposta")
-            return None
+    #         gr.Warning("Nessun audio disponibile per l'ultima risposta")
+    #         return None
 
-        except Exception as e:
-            gr.Error(f"Errore durante il download dell'audio: {str(e)}")
-            return None
+    #     except Exception as e:
+    #         gr.Error(f"Errore durante il download dell'audio: {str(e)}")
+    #         return None
 
     def format_conversation_for_audio(chat_history):
         """Formatta la conversazione per la sintesi vocale"""
@@ -116,25 +117,25 @@ def create_chatbot_tab():
             audio_text.append(f"{role} dice: {msg['content']}")
         return "\n".join(audio_text)
 
-    def generate_conversation_audio(chat_history):
-        """Genera audio della conversazione completa"""
-        try:
-            if not chat_history:
-                gr.Warning("Nessun messaggio nella chat")
-                return None
+    # def generate_conversation_audio(chat_history):
+    #     """Genera audio della conversazione completa"""
+    #     try:
+    #         if not chat_history:
+    #             gr.Warning("Nessun messaggio nella chat")
+    #             return None
 
-            conversation_text = format_conversation_for_audio(chat_history)
-            audio_path = generate_speech(conversation_text, is_user=False)
+    #         conversation_text = format_conversation_for_audio(chat_history)
+    #         audio_path = generate_speech(conversation_text, is_user=False)
 
-            if audio_path and os.path.exists(audio_path):
-                return audio_path
-            else:
-                gr.Warning("Errore nella generazione dell'audio")
-                return None
+    #         if audio_path and os.path.exists(audio_path):
+    #             return audio_path
+    #         else:
+    #             gr.Warning("Errore nella generazione dell'audio")
+    #             return None
 
-        except Exception as e:
-            gr.Error(f"Errore: {str(e)}")
-            return None
+    #     except Exception as e:
+    #         gr.Error(f"Errore: {str(e)}")
+    #         return None
 
     def convert_chat_to_audio(chat_history):
         if not chat_history:

ui/new_features_tab.py
DELETED
@@ -1,43 +0,0 @@
|
|
1 |
-
# ui/new_features_tab.py
|
2 |
-
|
3 |
-
import gradio as gr
|
4 |
-
from app.document_handling import search_documents
|
5 |
-
from app.functions.database_handling import list_databases
|
6 |
-
|
7 |
-
def create_new_features_tab():
|
8 |
-
"""Crea il tab 'Nuove Funzionalità' dell'interfaccia Gradio."""
|
9 |
-
|
10 |
-
def search_documents_callback(query, db_name):
|
11 |
-
"""Cerca documenti nel database in base alla query."""
|
12 |
-
results = search_documents(query, db_name)
|
13 |
-
return "\n".join(results)
|
14 |
-
|
15 |
-
# Ottieni la lista dei database
|
16 |
-
databases = list_databases()
|
17 |
-
|
18 |
-
with gr.Tab("Nuove Funzionalità"):
|
19 |
-
gr.Markdown("## Cerca Documenti e Genera Riassunto")
|
20 |
-
|
21 |
-
db_name_new = gr.Dropdown(choices=databases, label="Seleziona Database", value="default_db")
|
22 |
-
search_input = gr.Textbox(label="Inserisci Termini di Ricerca")
|
23 |
-
search_button = gr.Button("Cerca Documenti")
|
24 |
-
search_output = gr.Textbox(label="Documenti Trovati")
|
25 |
-
|
26 |
-
summary_button = gr.Button("Genera Riassunto")
|
27 |
-
summary_output = gr.Textbox(label="Riassunto")
|
28 |
-
|
29 |
-
# Evento per il bottone di ricerca
|
30 |
-
search_button.click(
|
31 |
-
search_documents_callback,
|
32 |
-
inputs=[search_input, db_name_new],
|
33 |
-
outputs=search_output
|
34 |
-
)
|
35 |
-
|
36 |
-
# Evento per il bottone di generazione riassunto (implementare generate_summary se necessario)
|
37 |
-
# summary_button.click(
|
38 |
-
# generate_summary,
|
39 |
-
# inputs=db_name_new,
|
40 |
-
# outputs=summary_output
|
41 |
-
# )
|
42 |
-
|
43 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|