Nugh75 committed
Commit 3c5ed5b · 1 Parent(s): f622cac

update structure

app.py CHANGED
@@ -3,7 +3,7 @@
 import gradio as gr
 import logging
 from app.logging_config import configure_logging
-from app.functions.database_handling import list_databases
+from app.utils.database_handling import list_databases
 from ui.chatbot_tab import create_chatbot_tab
 from ui.db_management_tab import create_db_management_tab
 from ui.document_management_tab import create_document_management_tab
app/document_handling.py CHANGED
@@ -1,70 +1,13 @@
 import logging
-import gradio as gr
 from langchain_community.vectorstores import FAISS
 import os
-import PyPDF2
-from docx import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 import json
 from datetime import datetime
-from app.functions.database_handling import BASE_DB_PATH
+from app.utils.database_handling import BASE_DB_PATH
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
-from app.utils.embedding_utils import get_embeddings
+from app.utils.embedding_utils import *
 from app.utils.dataclass_utils import DocumentMetadata, save_metadata
-
-
-# -------------- UTILITY FUNCTIONS --------------
-
-def extract_text_from_pdf(file_path):
-    """
-    Extracts the text from a PDF file.
-
-    Args:
-        file_path: Path to the PDF file
-
-    Returns:
-        str: Text extracted from the PDF
-    """
-    with open(file_path, 'rb') as f:
-        reader = PyPDF2.PdfReader(f)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text()
-    return text
-
-def extract_text_from_docx(file_path):
-    """
-    Extracts the text from a DOCX file.
-
-    Args:
-        file_path: Path to the DOCX file
-
-    Returns:
-        str: Text extracted from the Word document
-    """
-    doc = Document(file_path)
-    text = ""
-    for para in doc.paragraphs:
-        text += para.text + "\n"
-    return text
-
-def create_chunks(text):
-    from app.config import EMBEDDING_CONFIG
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=EMBEDDING_CONFIG["chunk_size"],
-        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
-        length_function=len,
-        separators=["\n\n", "\n", " ", ""]
-    )
-    return text_splitter.split_text(text)
-
-
-def create_vectorstore(texts, metadatas, db_path):
-    embeddings = get_embeddings()
-    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
-
-
+from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx
 
 
 # -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
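
Note on the rewritten import block: `from app.utils.embedding_utils import *` swaps the previous explicit `get_embeddings` import for a wildcard, which pulls every public name of `embedding_utils` into `document_handling`'s namespace. An explicit equivalent, assuming the three helpers `embedding_utils` now defines are the only names this module needs, would be:

    # Hypothetical explicit form of the wildcard import; assumes these are
    # the only embedding_utils names that document_handling actually uses.
    from app.utils.embedding_utils import get_embeddings, create_chunks, create_vectorstore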
app/functions/__init__.py DELETED
File without changes
app/llm_handling.py CHANGED
@@ -10,7 +10,7 @@ from app.config import BASE_DB_PATH  # Ensure correct import
 from app.config import LLM_CONFIGS, LLMType  # Import LLMType and LLM_CONFIGS
 from app.configs.prompts import SYSTEM_PROMPTS
 from app.utils.embedding_utils import get_embeddings
-from app.utils.voice_utils import generate_speech  # Retain import if needed
+
 
 logging.basicConfig(level=logging.INFO)
 
app/llm_handling2.py DELETED
@@ -1,188 +0,0 @@
-# llm_handling.py
-import logging
-import os
-from langchain_community.vectorstores import FAISS
-import requests
-from tenacity import retry, stop_after_attempt, wait_exponential
-import json
-
-from app.config import *
-from app.configs.prompts import SYSTEM_PROMPTS
-from app.utils.embedding_utils import get_embeddings
-from app.utils.voice_utils import generate_speech
-
-logging.basicConfig(level=logging.INFO)
-
-# =====================================
-# LLM-related functions
-# =====================================
-
-def get_llm_client(llm_type: LLMType):
-    """Gets the appropriate client for the selected model"""
-    config = LLM_CONFIGS.get(llm_type)
-    if not config:
-        raise ValueError(f"Model {llm_type} is not supported")
-    return config["client"](), config["model"]
-
-def get_system_prompt(prompt_type="tutor"):
-    """Selects the appropriate system prompt"""
-    return SYSTEM_PROMPTS.get(prompt_type, SYSTEM_PROMPTS["tutor"])
-
-def test_local_connection():
-    """Checks the connection to the local LLM server"""
-    try:
-        response = requests.get(f"http://192.168.82.5:1234/v1/health", timeout=5)
-        return response.status_code == 200
-    except:
-        return False
-
-def read_metadata(db_path):
-    metadata_file = os.path.join(db_path, "metadata.json")
-    if os.path.exists(metadata_file):
-        with open(metadata_file, 'r') as f:
-            return json.load(f)
-    return []
-
-def get_relevant_documents(vectorstore, question, min_similarity=0.7):
-    """Retrieves the relevant documents from the vectorstore"""
-    try:
-        # Refine the query before searching
-        enhanced_query = enhance_query(question)
-
-        # Get documents together with similarity scores
-        docs_and_scores = vectorstore.similarity_search_with_score(
-            enhanced_query,
-            k=8  # Increase the number of retrieved documents
-        )
-
-        # Filter the documents by similarity
-        filtered_docs = [
-            doc for doc, score in docs_and_scores
-            if score >= min_similarity
-        ]
-
-        # Log the results for debugging
-        logging.info(f"Query: {question}")
-        logging.info(f"Documents found: {len(filtered_docs)}")
-
-        # Return up to five documents, or an empty list
-        return filtered_docs[:5] if filtered_docs else []
-
-    except Exception as e:
-        logging.error(f"Error while retrieving documents: {e}")
-        return []  # Return an empty list instead of None
-
-def enhance_query(question):
-    # Remove insignificant words (Italian stop words)
-    stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
-    words = [w for w in question.lower().split() if w not in stop_words]
-
-    # Extract the key words
-    enhanced_query = " ".join(words)
-    return enhanced_query
-
-def log_search_results(question, docs_and_scores):
-    logging.info(f"Query: {question}")
-    for idx, (doc, score) in enumerate(docs_and_scores, 1):
-        logging.info(f"Doc {idx} - Score: {score:.4f}")
-        logging.info(f"Content: {doc.page_content[:100]}...")
-
-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
-def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
-    if chat_history is None:
-        chat_history = []
-
-    try:
-        embeddings = get_embeddings()  # Use the shared helper
-        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-
-        # Read the metadata
-        metadata_list = read_metadata(db_path)
-        metadata_dict = {m["filename"]: m for m in metadata_list}
-
-        # Retrieve the relevant documents
-        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-        relevant_docs = get_relevant_documents(vectorstore, question)
-
-        if not relevant_docs:
-            return [
-                {"role": "user", "content": question},
-                {"role": "assistant", "content": "Sorry, I could not find relevant information to answer your question. Try rephrasing it or asking a different question."}
-            ]
-
-        # Prepare the source citations with chunk numbering
-        sources = []
-        for idx, doc in enumerate(relevant_docs, 1):
-            source_file = doc.metadata.get("source", "Unknown")
-            if source_file in metadata_dict:
-                meta = metadata_dict[source_file]
-                sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Part {idx} of {len(relevant_docs)}")
-
-        # Prepare the context with the sources
-        context = "\n".join([
-            f"[Part {idx+1} of {len(relevant_docs)}]\n{doc.page_content}"
-            for idx, doc in enumerate(relevant_docs)
-        ])
-        sources_text = "\n\nSources consulted:\n" + "\n".join(set(sources))
-
-        # Extend the prompt to require source citations
-        prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
-        prompt += "\nAlways cite the sources used in your answer, including the document title and the author."
-
-        # Build the full message list
-        messages = [
-            {"role": "system", "content": prompt},
-            *[{"role": m["role"], "content": m["content"]} for m in chat_history],
-            {"role": "user", "content": question}
-        ]
-
-        # Get the answer from the LLM
-        client, model = get_llm_client(llm_type)
-        response = client.chat.completions.create(
-            model=model,
-            messages=messages,
-            temperature=0.7,
-            max_tokens=2048
-        )
-        answer = response.choices[0].message.content + sources_text
-
-
-
-        # return [
-        #     {"role": "user", "content": question, "audio": user_audio},
-        #     {"role": "assistant", "content": answer, "audio": assistant_audio}
-        # ]
-
-    except Exception as e:
-        logging.error(f"Error while generating the answer: {e}")
-        error_msg = "Local LLM unavailable. Try again later or use OpenAI." if "local" in str(llm_type) else str(e)
-        return [
-            {"role": "user", "content": question},
-            {"role": "assistant", "content": f"⚠️ {error_msg}"}
-        ]
-
-class DocumentRetriever:
-    def __init__(self, db_path):
-        self.embeddings = get_embeddings()
-        self.vectorstore = FAISS.load_local(
-            db_path,
-            self.embeddings,
-            allow_dangerous_deserialization=True
-        )
-
-    def get_relevant_chunks(self, question):
-        enhanced_query = enhance_query(question)
-        docs_and_scores = self.vectorstore.similarity_search_with_score(
-            enhanced_query,
-            k=8
-        )
-
-        log_search_results(question, docs_and_scores)
-        return self._filter_relevant_docs(docs_and_scores)
-
-
-if __name__ == "__main__":
-    pass
app/{functions → utils}/database_handling.py RENAMED
File without changes
app/utils/dataclass_utils.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import json
 from dataclasses import dataclass
-from app.functions.database_handling import BASE_DB_PATH
+from app.utils.database_handling import BASE_DB_PATH
 
 @dataclass
 class DocumentMetadata:
app/utils/embedding_utils.py CHANGED
@@ -1,6 +1,8 @@
 import torch
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from app.config import EMBEDDING_CONFIG
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
 def get_embeddings():
 
     """Initializes the embeddings using the configured model"""
@@ -9,4 +11,20 @@ def get_embeddings():
         model_name=EMBEDDING_CONFIG["model_name"],
         model_kwargs={'device': device}
     )
+def create_chunks(text):
+    from app.config import EMBEDDING_CONFIG
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=EMBEDDING_CONFIG["chunk_size"],
+        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    )
+    return text_splitter.split_text(text)
+
+
+def create_vectorstore(texts, metadatas, db_path):
+    embeddings = get_embeddings()
+    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
+
+
 
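As committed, `create_vectorstore` references `FAISS` without importing it in this module, and it builds the index but neither saves nor returns it. A minimal sketch of the likely intent, where the FAISS import, the `save_local` call, and the `return` are assumptions rather than part of this commit:

    from langchain_community.vectorstores import FAISS  # assumed: not imported in the committed module

    def create_vectorstore(texts, metadatas, db_path):
        # Build a FAISS index over the chunk texts with their metadata,
        # then persist it where db_path points (assumed intent).
        embeddings = get_embeddings()
        db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        db.save_local(db_path)
        return db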
app/utils/extract_utils.py ADDED
@@ -0,0 +1,35 @@
+import PyPDF2
+from docx import Document
+
+def extract_text_from_pdf(file_path):
+    """
+    Extracts the text from a PDF file.
+
+    Args:
+        file_path: Path to the PDF file
+
+    Returns:
+        str: Text extracted from the PDF
+    """
+    with open(file_path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+    return text
+
+def extract_text_from_docx(file_path):
+    """
+    Extracts the text from a DOCX file.
+
+    Args:
+        file_path: Path to the DOCX file
+
+    Returns:
+        str: Text extracted from the Word document
+    """
+    doc = Document(file_path)
+    text = ""
+    for para in doc.paragraphs:
+        text += para.text + "\n"
+    return text
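
One caveat in the added PDF helper: `page.extract_text()` returns an empty string for pages with no extractable text (for example scanned images), and some PyPDF2 versions have been reported to return `None`, which would make the `+=` concatenation raise a `TypeError`. A defensive variant of the loop, as a sketch:

        for page in reader.pages:
            # The "or" fallback is a defensive assumption covering PyPDF2
            # versions where extract_text() may yield None.
            text += page.extract_text() or ""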
ui/chatbot_tab.py CHANGED
@@ -2,7 +2,7 @@
 
 import logging
 import gradio as gr
-from app.functions.database_handling import list_databases
+from app.utils.database_handling import list_databases
 from app.configs.prompts import SYSTEM_PROMPTS
 from app.llm_handling import answer_question, LLMType
 from app.utils.helpers import extract_text_from_files
ui/db_management_tab.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from app.functions.database_handling import create_database, modify_database, delete_database, list_databases
+from app.utils.database_handling import create_database, modify_database, delete_database, list_databases
 
 def create_db_management_tab(dropdowns):
     databases = list_databases()
ui/document_management_tab.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import logging
 from app.document_handling import upload_and_index, list_indexed_files, delete_file_from_database
-from app.functions.database_handling import list_databases
+from app.utils.database_handling import list_databases
 
 def create_document_management_tab():
     """Creates the 'Document Management' tab of the Gradio interface."""
ui/management_tabs.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import logging
 from app.document_handling import upload_and_index, list_indexed_files, delete_file_from_database
-from app.functions.database_handling import (
+from app.utils.database_handling import (
     create_database,
     modify_database,
     delete_database,