Modularization of the program

Every feature now has its own file in its own folder; the program still needs further refinement. A short usage sketch of the new layout follows the file list below.
- app.py +1 -3
- app/config.py +48 -1
- app/document_handling.py +21 -109
- app/functions/database_handling.py +1 -3
- app/llm_handling.py +92 -198
- app/llm_handling2.py +188 -0
- {utils → app/utils}/__init__.py +0 -0
- app/utils/dataclass_utils.py +59 -0
- app/utils/embedding_utils.py +12 -0
- {utils → app/utils}/helpers.py +0 -0
- app/utils/markdowns_utils.py +14 -0
- app/utils/voice_utils.py +71 -0
- db/.DS_Store +0 -0
- db/faiss_index/index.faiss +0 -0
- db/faiss_index/index.pkl +0 -3
- db/faiss_index_Daniele2/.DS_Store +0 -0
- db/faiss_index_Daniele2/index.faiss +0 -0
- db/faiss_index_Daniele2/index.pkl +0 -3
- db/faiss_index_Daniele2/metadata.json +0 -9
- db/faiss_index_Orienta/index.faiss +0 -3
- db/faiss_index_Orienta/index.pkl +0 -3
- db/faiss_index_Orienta/metadata.json +0 -9
- db/faiss_index_default_db/index.faiss +0 -0
- db/faiss_index_default_db/index.pkl +0 -3
- ui/chatbot_tab.py +38 -37
- ui/new_features_tab.py +0 -43
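
The refactor centralizes configuration in app/config.py and moves shared helpers under app/utils. As a rough usage sketch of how the refactored modules fit together after this commit (the question text and database name are invented for illustration; module paths come from the file list above):

# Usage sketch, assuming the package layout introduced by this commit.
from app.config import LLMType
from app.llm_handling import answer_question

# db_name maps to the folder db/faiss_index_<db_name>.
history = answer_question(
    "Quali sono gli obiettivi di apprendimento?",  # invented example question
    db_name="default_db",
    llm_type=LLMType.OPENAI_GPT_4O_MINI,
)
for msg in history:
    print(msg["role"], ":", msg["content"][:80])
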
app.py
CHANGED
@@ -7,7 +7,6 @@ from app.functions.database_handling import list_databases
 from ui.chatbot_tab import create_chatbot_tab
 from ui.db_management_tab import create_db_management_tab
 from ui.document_management_tab import create_document_management_tab
-from ui.new_features_tab import create_new_features_tab
 from ui.info_tab import create_info_tab  # Importa la nuova tab
 
 # Configura il logging
@@ -39,8 +38,7 @@ def main():
         chat_refs  # Tab 4: Chatbot (ultima tab)
         doc_refs  # Tab 2: Document Management
         db_refs(dropdowns)
-
-        create_new_features_tab()  # Tab 3: Features
+
         info_refs  # Tab 5: Info (ultima tab)
 
     rag_chatbot.launch()

app/config.py
CHANGED
@@ -1,5 +1,10 @@
 import os
 from dotenv import load_dotenv
+from enum import Enum
+from openai import OpenAI
+from pathlib import Path
+
+
 
 # Carica le variabili d'ambiente dal file .env
 load_dotenv()
@@ -7,4 +12,46 @@ load_dotenv()
 # Configurazione del modello
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    raise ValueError("OPENAI_API_KEY non trovata. Verifica il file .env")
+    raise ValueError("OPENAI_API_KEY non trovata. Verifica il file .env")
+
+class LLMType(Enum):
+    OPENAI_GPT_4O_MINI = "openai - GPT-4o-mini"
+    LOCAL_QWEN = "local - Qwen 7B"
+    LOCAL_PHI = "local - Phi-3 Mini"
+
+# Configurazione modelli
+LLM_CONFIGS = {
+    LLMType.OPENAI_GPT_4O_MINI: {
+        "client": lambda: OpenAI(api_key=OPENAI_API_KEY),
+        "model": "gpt-4-mini",
+        "base_url": None
+    },
+    LLMType.LOCAL_QWEN: {
+        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
+        "model": "qwen2.5-coder-7b-instruct",
+        "base_url": "http://192.168.82.5:1234/v1"
+    },
+    LLMType.LOCAL_PHI: {
+        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
+        "model": "phi-3.5-mini-ita",
+        "base_url": "http://192.168.82.5:1234/v1"
+    }
+}
+
+EMBEDDING_CONFIG = {
+    "model_name": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    "chunk_size": 2000,
+    "chunk_overlap": 100,
+    "k_documents": 5,
+    "min_similarity": 0.7
+}
+
+# Aggiungi questa costante
+EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+
+# Definisci il percorso base per i database
+BASE_DB_PATH = "db"
+
+# Voci italiane edge-tts
+VOICE_USER = "it-IT-DiegoNeural"  # Voce maschile utente
+VOICE_ASSISTANT = "it-IT-ElsaNeural"  # Voce femminile assistente

app/document_handling.py
CHANGED
@@ -1,79 +1,20 @@
 import logging
 import gradio as gr
 from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
 import os
-import shutil
 import PyPDF2
 from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from dataclasses import dataclass
 import json
 from datetime import datetime
 from app.functions.database_handling import BASE_DB_PATH
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
+from app.utils.embedding_utils import get_embeddings
+from app.utils.dataclass_utils import DocumentMetadata, save_metadata
 
-# Initialize the text splitter
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
 
 # -------------- UTILITY FUNCTIONS --------------
-@dataclass
-class DocumentMetadata:
-    """
-    Classe per gestire i metadati dei documenti.
-
-    Attributi:
-        filename (str): Nome del file originale
-        title (str): Titolo assegnato al documento
-        author (str): Autore del documento
-        upload_date (str): Data di caricamento
-        chunks (int): Numero di chunks in cui è stato diviso il documento
-    """
-    filename: str
-    title: str
-    author: str
-    upload_date: str
-    chunks: int
-
-    def to_dict(self):
-        """Converte i metadati in un dizionario per il salvataggio JSON."""
-        return {
-            "filename": self.filename,
-            "title": self.title,
-            "author": self.author,
-            "upload_date": self.upload_date,
-            "chunks": self.chunks
-        }
-
-def save_metadata(metadata_list, db_name):
-    """
-    Salva i metadati dei documenti nel database specificato.
-
-    Args:
-        metadata_list: Lista di oggetti DocumentMetadata da salvare
-        db_name: Nome del database in cui salvare i metadati
-
-    Note:
-        I metadati vengono salvati in un file JSON nella directory del database
-    """
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-    metadata_file = os.path.join(db_path, "metadata.json")
-
-    # Crea la directory se non esiste
-    if not os.path.exists(db_path):
-        os.makedirs(db_path)
-
-    # Carica metadati esistenti se presenti
-    existing_metadata = []
-    if os.path.exists(metadata_file):
-        with open(metadata_file, 'r') as f:
-            existing_metadata = json.load(f)
-
-    # Aggiungi nuovi metadati
-    existing_metadata.extend([m.to_dict() for m in metadata_list])
-
-    # Salva il file aggiornato
-    with open(metadata_file, 'w') as f:
-        json.dump(existing_metadata, f, indent=2)
 
 def extract_text_from_pdf(file_path):
     """
@@ -108,34 +49,26 @@ def extract_text_from_docx(file_path):
         text += para.text + "\n"
     return text
 
-
-
-
-
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-    if not os.path.exists(db_path):
-        logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
-        return "Database non trovato."
-
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-
-    # Perform a similarity search
-    docs = vectorstore.similarity_search(question)
-
-    if not docs:
-        return "Nessun documento corrispondente alla query."
-
-    # Collect the document contents
-    results = [doc.page_content for doc in docs]
-    return "\n\n".join(results)
-
+def create_chunks(text):
+    from app.config import EMBEDDING_CONFIG
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=EMBEDDING_CONFIG["chunk_size"],
+        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    )
+    return text_splitter.split_text(text)
 
 
+def create_vectorstore(texts, metadatas, db_path):
+    embeddings = get_embeddings()
+    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
 
 
 # -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
+
 def upload_and_index(files, title, author, db_name="default_db"):
     if not files:
         return "Nessun file caricato."
@@ -154,7 +87,7 @@ def upload_and_index(files, title, author, db_name="default_db"):
         with open(file.name, 'r', encoding='utf-8') as f:
             text = f.read()
 
-        chunks = text_splitter.split_text(text)
+        chunks = create_chunks(text)
 
         # Metadata per il documento
         doc_meta = DocumentMetadata(
@@ -189,7 +122,8 @@ def upload_and_index(files, title, author, db_name="default_db"):
     db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
     os.makedirs(db_path, exist_ok=True)
 
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    # Usa la funzione centralizzata invece dell'inizializzazione diretta
+    embeddings = get_embeddings()
     texts = [doc["content"] for doc in documents]
     metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
 
@@ -265,26 +199,4 @@ def delete_file_from_database(file_name, db_name="default_db"):
     except Exception as e:
        return f"Errore durante la rimozione del file: {e}"
 
-# -------------- NEW FEATURES TAB FUNCTIONS --------------
-def search_documents(query, db_name="default_db"):
-    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")  # Modifica qui
-    if not os.path.exists(db_path):
-        logging.warning(f"L'indice FAISS per il database '{db_name}' non esiste.")
-        return "Database non trovato."
-
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-
-    # Perform a similarity search
-    docs = vectorstore.similarity_search(query)
-
-    if not docs:
-        return "Nessun documento corrispondente alla query."
-
-    # Collect the document contents
-    results = [doc.page_content for doc in docs]
-    return "\n\n".join(results)
 
-def generate_summary(db_name="default_db"):
-    # Placeholder per la logica di summarization
-    return "This is a summary of the documents in the database."

app/functions/database_handling.py
CHANGED
@@ -3,9 +3,7 @@ import os
 import shutil
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
-
-# Definisci il percorso base per i database
-BASE_DB_PATH = "db"
+from app.config import BASE_DB_PATH
 
 # Crea la cartella db se non esiste
 if not os.path.exists(BASE_DB_PATH):

app/llm_handling.py
CHANGED
@@ -1,142 +1,39 @@
+# llm_handling.py
 import logging
 import os
-import shutil
-from enum import Enum
-
-from openai import OpenAI
 from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-import gradio as gr
-import asyncio
-import edge_tts
-from pathlib import Path
 import requests
 from tenacity import retry, stop_after_attempt, wait_exponential
+import json
 
-from app.config import OPENAI_API_KEY
-from app.functions.database_handling import BASE_DB_PATH
+from app.config import BASE_DB_PATH  # Ensure correct import
+from app.config import LLM_CONFIGS, LLMType  # Import LLMType and LLM_CONFIGS
 from app.configs.prompts import SYSTEM_PROMPTS
-
-import json
+from app.utils.embedding_utils import get_embeddings
+from app.utils.voice_utils import generate_speech  # Retain import if needed
 
 logging.basicConfig(level=logging.INFO)
-local_ip="192.168.82.5:1234"
-
-class LLMType(Enum):
-    OPENAI_GPT_4O_MINI = "openai - GPT-4o-mini"
-    LOCAL_QWEN = "local - Qwen 7B"
-    LOCAL_PHI = "local - Phi-3 Mini"
-
-# Configurazione modelli
-LLM_CONFIGS = {
-    LLMType.OPENAI_GPT_4O_MINI: {
-        "client": lambda: OpenAI(api_key=OPENAI_API_KEY),
-        "model": "gpt-4-mini",
-        "base_url": None
-    },
-    LLMType.LOCAL_QWEN: {
-        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
-        "model": "qwen2.5-coder-7b-instruct",
-        "base_url": "http://192.168.82.5:1234/v1"
-    },
-    LLMType.LOCAL_PHI: {
-        "client": lambda: OpenAI(base_url="http://192.168.82.5:1234/v1", api_key="not-needed"),
-        "model": "phi-3.5-mini-ita",
-        "base_url": "http://192.168.82.5:1234/v1"
-    }
-}
 
+# =====================================
+# Functions related to LLM
+# =====================================
+
 def get_llm_client(llm_type: LLMType):
-    """Ottiene il client appropriato per il modello selezionato"""
+    """Obtains the appropriate client for the selected model"""
     config = LLM_CONFIGS.get(llm_type)
     if not config:
-        raise ValueError(f"Modello {llm_type} non supportato")
-    return config["client"](), config["model"]
-
-# Voci italiane edge-tts
-VOICE_USER = "it-IT-DiegoNeural"  # Voce maschile utente
-VOICE_ASSISTANT = "it-IT-ElsaNeural"  # Voce femminile assistente
-
-async def text_to_speech(text, voice_name, output_file):
-    """Genera audio usando edge-tts"""
-    communicate = edge_tts.Communicate(text, voice_name)
-    await communicate.save(output_file)
-
-def generate_speech(text, is_user=True):
-    try:
-        # Crea directory per audio temporanei
-        audio_dir = Path("temp_audio")
-        audio_dir.mkdir(exist_ok=True)
-
-        # Seleziona voce e genera nome file
-        voice = VOICE_USER if is_user else VOICE_ASSISTANT
-        file_name = f"speech_{hash(text)}.mp3"
-        output_path = audio_dir / file_name
-
-        # Genera audio
-        asyncio.run(text_to_speech(text, voice, str(output_path)))
-        return str(output_path)
-
-    except Exception as e:
-        logging.error(f"Errore TTS: {e}")
-        return None
-
-import re
-
-def clean_markdown(text):
-    """Rimuove markdown dal testo"""
-    text = re.sub(r'```[\s\S]*?```', '', text)  # blocchi codice
-    text = re.sub(r'`.*?`', '', text)  # codice inline
-    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # link
-    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # bold
-    text = re.sub(r'\*(.*?)\*', r'\1', text)  # italic
-    return text.strip()
-
-def generate_chat_audio(chat_history):
-    """Genera audio della conversazione con voci alternate"""
-    try:
-        audio_files = []
-        audio_dir = Path("temp_audio")
-        audio_dir.mkdir(exist_ok=True)
-
-        # Genera audio per ogni messaggio
-        for msg in chat_history:
-            content = clean_markdown(msg["content"])
-            if not content.strip():
-                continue
-
-            voice = VOICE_USER if msg["role"] == "user" else VOICE_ASSISTANT
-            file_name = f"chat_{msg['role']}_{hash(content)}.mp3"
-            output_path = audio_dir / file_name
-
-            # Genera audio senza prefissi
-            asyncio.run(text_to_speech(content, voice, str(output_path)))
-            audio_files.append(str(output_path))
-
-        # Combina tutti gli audio
-        if audio_files:
-            from pydub import AudioSegment
-            combined = AudioSegment.empty()
-            for audio_file in audio_files:
-                segment = AudioSegment.from_mp3(audio_file)
-                combined += segment
-
-            final_path = audio_dir / f"chat_complete_{hash(str(chat_history))}.mp3"
-            combined.export(str(final_path), format="mp3")
-            return str(final_path)
-
-        return None
-
-    except Exception as e:
-        logging.error(f"Errore generazione audio: {e}")
-        return None
+        raise ValueError(f"Model {llm_type} not supported")
+    client_class = config["client"]
+    model = config["model"]
+    client = client_class()  # Ensure no arguments are needed
+    return client, model
 
 def get_system_prompt(prompt_type="tutor"):
-    """Seleziona il prompt di sistema appropriato"""
+    """Selects the appropriate system prompt"""
     return SYSTEM_PROMPTS.get(prompt_type, SYSTEM_PROMPTS["tutor"])
 
 def test_local_connection():
-    """
+    """Checks connection to the local LLM server"""
     try:
         response = requests.get(f"http://192.168.82.5:1234/v1/health", timeout=5)
         return response.status_code == 200
@@ -150,111 +47,108 @@ def read_metadata(db_path):
         return json.load(f)
     return []
 
-
-def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
+def get_relevant_documents(vectorstore, question, min_similarity=0.7):
+    """Retrieves relevant documents from the vectorstore"""
+    try:
+        enhanced_query = enhance_query(question)
+        docs_and_scores = vectorstore.similarity_search_with_score(
+            enhanced_query,
+            k=8
+        )
+        filtered_docs = [
+            doc for doc, score in docs_and_scores if score >= min_similarity
+        ]
+        logging.info(f"Query: {question}")
+        logging.info(f"Documents found: {len(filtered_docs)}")
+        return filtered_docs[:5] if filtered_docs else []
+    except Exception as e:
+        logging.error(f"Error retrieving documents: {e}")
+        return []
+
+def enhance_query(question):
+    stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
+    words = [w for w in question.lower().split() if w not in stop_words]
+    enhanced_query = " ".join(words)
+    return enhanced_query
+
+def log_search_results(question, docs_and_scores):
+    logging.info(f"Query: {question}")
+    for idx, (doc, score) in enumerate(docs_and_scores, 1):
+        logging.info(f"Doc {idx} - Score: {score:.4f}")
+        logging.info(f"Content: {doc.page_content[:100]}...")
+
+@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=None):
     if chat_history is None:
         chat_history = []
-
-    logging.info(f"Inizio elaborazione domanda: {question} per database: {db_name}")
-
     try:
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-
-        if not os.path.exists(db_path):
-            logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
+        embeddings = get_embeddings()
+        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
+        metadata_list = read_metadata(db_path)
+        metadata_dict = {m["filename"]: m for m in metadata_list}
+        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
+        relevant_docs = get_relevant_documents(vectorstore, question)
+        if not relevant_docs:
             return [
                 {"role": "user", "content": question},
-                {"role": "assistant", "content": "Database non trovato."}
+                {"role": "assistant", "content": "Sorry, no relevant information found to answer your question. Try rephrasing or asking a different question."}
             ]
-
-        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-
-        # Cerca i documenti (chunk) più simili
-        relevant_docs = vectorstore.similarity_search(question, k=5)
-
-        metadata_list = read_metadata(db_path)
-        metadata_dict = {m["filename"]: m for m in metadata_list}
-
-        # Logga i chunk recuperati con metadata
-        for idx, doc in enumerate(relevant_docs):
-            logging.info(f"--- Chunk {idx+1} ---")
+        sources = []
+        for idx, doc in enumerate(relevant_docs, 1):
             source_file = doc.metadata.get("source", "Unknown")
-
-            # Recupera i metadata dal file json
             if source_file in metadata_dict:
-
-
-
-        # Prepara il contesto dai documenti
-        context = "\n".join([doc.page_content for doc in relevant_docs])
+                meta = metadata_dict[source_file]
+                sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Part {idx} of {len(relevant_docs)}")
+        context = "\n".join([
+            f"[Part {idx+1} of {len(relevant_docs)}]\n{doc.page_content}"
+            for idx, doc in enumerate(relevant_docs)
+        ])
+        sources_text = "\n\nSources consulted:\n" + "\n".join(set(sources))
         prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
-
-        # Prepara la cronologia completa delle conversazioni
-        conversation_history = []
-        for msg in chat_history:  # Rimuovo limite di 4 messaggi
-            conversation_history.append({
-                "role": msg["role"],
-                "content": msg["content"]
-            })
-
-        # Costruisci messaggio con contesto completo
+        prompt += "\nAlways cite the sources used for your response, including the document title and author."
         messages = [
            {"role": "system", "content": prompt},
-            *conversation_history,
+            *[{"role": m["role"], "content": m["content"]} for m in chat_history],
            {"role": "user", "content": question}
        ]
-
-        if "local" in str(llm_type):
-            if not test_local_connection():
-                raise ConnectionError("LM Studio non raggiungibile")
-
        client, model = get_llm_client(llm_type)
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7,
-            max_tokens=2048
+            max_tokens=2048
        )
-        answer = response.choices[0].message.content
-
-        # Genera audio per domanda e risposta
-        user_audio = generate_speech(question, is_user=True)
-        assistant_audio = generate_speech(answer, is_user=False)
-
+        answer = response.choices[0].message.content + sources_text
        return [
-            {"role": "user", "content": question, "audio": user_audio},
-            {"role": "assistant", "content": answer, "audio": assistant_audio}
+            {"role": "user", "content": question},
+            {"role": "assistant", "content": answer}
        ]
-
    except Exception as e:
-        logging.error(f"Errore durante la generazione della risposta: {e}")
-        error_msg = "LLM locale non disponibile. Riprova più tardi o usa OpenAI." if "local" in str(llm_type) else str(e)
+        logging.error(f"Error generating response: {e}")
+        error_msg = "Local LLM not available. Try again later or use OpenAI." if "local" in str(llm_type) else str(e)
        return [
            {"role": "user", "content": question},
            {"role": "assistant", "content": f"⚠️ {error_msg}"}
        ]
 
+class DocumentRetriever:
+    def __init__(self, db_path):
+        self.embeddings = get_embeddings()
+        self.vectorstore = FAISS.load_local(
+            db_path,
+            self.embeddings,
+            allow_dangerous_deserialization=True
+        )
+
+    def get_relevant_chunks(self, question):
+        enhanced_query = enhance_query(question)
+        docs_and_scores = self.vectorstore.similarity_search_with_score(
+            enhanced_query,
+            k=8
+        )
+        log_search_results(question, docs_and_scores)
+        # Implement _filter_relevant_docs or remove the call
+        # return self._filter_relevant_docs(docs_and_scores)
 
 if __name__ == "__main__":
-
-    pass
+    pass

app/llm_handling2.py
ADDED
# llm_handling.py
import logging
import os
from langchain_community.vectorstores import FAISS
import requests
from tenacity import retry, stop_after_attempt, wait_exponential
import json

from app.config import *
from app.configs.prompts import SYSTEM_PROMPTS
from app.utils.embedding_utils import get_embeddings
from app.utils.voice_utils import generate_speech

logging.basicConfig(level=logging.INFO)

# =====================================
# Funzioni relative al LLM
# =====================================

def get_llm_client(llm_type: LLMType):
    """Ottiene il client appropriato per il modello selezionato"""
    config = LLM_CONFIGS.get(llm_type)
    if not config:
        raise ValueError(f"Modello {llm_type} non supportato")
    return config["client"](), config["model"]

def get_system_prompt(prompt_type="tutor"):
    """Seleziona il prompt di sistema appropriato"""
    return SYSTEM_PROMPTS.get(prompt_type, SYSTEM_PROMPTS["tutor"])

def test_local_connection():
    """Verifica la connessione al server LLM locale"""
    try:
        response = requests.get(f"http://192.168.82.5:1234/v1/health", timeout=5)
        return response.status_code == 200
    except:
        return False

def read_metadata(db_path):
    metadata_file = os.path.join(db_path, "metadata.json")
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            return json.load(f)
    return []

def get_relevant_documents(vectorstore, question, min_similarity=0.7):
    """Recupera i documenti rilevanti dal vectorstore"""
    try:
        # Migliora la query prima della ricerca
        enhanced_query = enhance_query(question)

        # Ottieni documenti con punteggi di similarità
        docs_and_scores = vectorstore.similarity_search_with_score(
            enhanced_query,
            k=8  # Aumenta il numero di documenti recuperati
        )

        # Filtra i documenti per similarità
        filtered_docs = [
            doc for doc, score in docs_and_scores
            if score >= min_similarity
        ]

        # Log dei risultati per debug
        logging.info(f"Query: {question}")
        logging.info(f"Documenti trovati: {len(filtered_docs)}")

        # Restituisci almeno un documento o una lista vuota
        return filtered_docs[:5] if filtered_docs else []

    except Exception as e:
        logging.error(f"Errore nel recupero dei documenti: {e}")
        return []  # Restituisce lista vuota invece di None

def enhance_query(question):
    # Rimuovi parole non significative
    stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
    words = [w for w in question.lower().split() if w not in stop_words]

    # Estrai keywords chiave
    enhanced_query = " ".join(words)
    return enhanced_query

def log_search_results(question, docs_and_scores):
    logging.info(f"Query: {question}")
    for idx, (doc, score) in enumerate(docs_and_scores, 1):
        logging.info(f"Doc {idx} - Score: {score:.4f}")
        logging.info(f"Content: {doc.page_content[:100]}...")

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
    if chat_history is None:
        chat_history = []

    try:
        embeddings = get_embeddings()  # Usa la funzione comune
        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")

        # Leggi i metadati
        metadata_list = read_metadata(db_path)
        metadata_dict = {m["filename"]: m for m in metadata_list}

        # Recupera i documenti rilevanti
        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
        relevant_docs = get_relevant_documents(vectorstore, question)

        if not relevant_docs:
            return [
                {"role": "user", "content": question},
                {"role": "assistant", "content": "Mi dispiace, non ho trovato informazioni rilevanti per rispondere alla tua domanda. Prova a riformularla o a fare una domanda diversa."}
            ]

        # Prepara le citazioni delle fonti con numerazione dei chunk
        sources = []
        for idx, doc in enumerate(relevant_docs, 1):
            source_file = doc.metadata.get("source", "Unknown")
            if source_file in metadata_dict:
                meta = metadata_dict[source_file]
                sources.append(f"📚 {meta['title']} (Autore: {meta['author']}) - Parte {idx} di {len(relevant_docs)}")

        # Prepara il contesto con le fonti
        context = "\n".join([
            f"[Parte {idx+1} di {len(relevant_docs)}]\n{doc.page_content}"
            for idx, doc in enumerate(relevant_docs)
        ])
        sources_text = "\n\nFonti consultate:\n" + "\n".join(set(sources))

        # Aggiorna il prompt per includere la richiesta di citare le fonti
        prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
        prompt += "\nCita sempre le fonti utilizzate per la tua risposta includendo il titolo del documento e l'autore."

        # Costruisci il messaggio completo
        messages = [
            {"role": "system", "content": prompt},
            *[{"role": m["role"], "content": m["content"]} for m in chat_history],
            {"role": "user", "content": question}
        ]

        # Ottieni la risposta dall'LLM
        client, model = get_llm_client(llm_type)
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7,
            max_tokens=2048
        )
        answer = response.choices[0].message.content + sources_text

        # return [
        #     {"role": "user", "content": question, "audio": user_audio},
        #     {"role": "assistant", "content": answer, "audio": assistant_audio}
        # ]

    except Exception as e:
        logging.error(f"Errore durante la generazione della risposta: {e}")
        error_msg = "LLM locale non disponibile. Riprova più tardi o usa OpenAI." if "local" in str(llm_type) else str(e)
        return [
            {"role": "user", "content": question},
            {"role": "assistant", "content": f"⚠️ {error_msg}"}
        ]

class DocumentRetriever:
    def __init__(self, db_path):
        self.embeddings = get_embeddings()
        self.vectorstore = FAISS.load_local(
            db_path,
            self.embeddings,
            allow_dangerous_deserialization=True
        )

    def get_relevant_chunks(self, question):
        enhanced_query = enhance_query(question)
        docs_and_scores = self.vectorstore.similarity_search_with_score(
            enhanced_query,
            k=8
        )

        log_search_results(question, docs_and_scores)
        return self._filter_relevant_docs(docs_and_scores)


if __name__ == "__main__":
    pass
{utils → app/utils}/__init__.py
RENAMED
File without changes

app/utils/dataclass_utils.py
ADDED
import os
import json
from dataclasses import dataclass
from app.functions.database_handling import BASE_DB_PATH

@dataclass
class DocumentMetadata:
    """
    Classe per gestire i metadati dei documenti.

    Attributi:
        filename (str): Nome del file originale
        title (str): Titolo assegnato al documento
        author (str): Autore del documento
        upload_date (str): Data di caricamento
        chunks (int): Numero di chunks in cui è stato diviso il documento
    """
    filename: str
    title: str
    author: str
    upload_date: str
    chunks: int

    def to_dict(self):
        """Converte i metadati in un dizionario per il salvataggio JSON."""
        return {
            "filename": self.filename,
            "title": self.title,
            "author": self.author,
            "upload_date": self.upload_date,
            "chunks": self.chunks
        }

def save_metadata(metadata_list, db_name):
    """
    Salva i metadati dei documenti nel database specificato.

    Args:
        metadata_list: Lista di oggetti DocumentMetadata da salvare
        db_name: Nome del database in cui salvare i metadati

    Note:
        I metadati vengono salvati in un file JSON nella directory del database
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(db_path):
        os.makedirs(db_path)

    existing_metadata = []
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            existing_metadata = json.load(f)

    existing_metadata.extend([m.to_dict() for m in metadata_list])

    with open(metadata_file, 'w') as f:
        json.dump(existing_metadata, f, indent=2)

app/utils/embedding_utils.py
ADDED
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from app.config import EMBEDDING_CONFIG

def get_embeddings():
    """Inizializza gli embeddings usando il modello configurato"""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_CONFIG["model_name"],
        model_kwargs={'device': device}
    )
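
get_embeddings() is now the single place where the embedding model is instantiated, so indexes are always built and queried with the same model. A small usage sketch (the sample text and the faiss_index_demo path are invented for illustration):

# Usage sketch: build, save and reload a FAISS index with the shared helper.
from langchain_community.vectorstores import FAISS
from app.utils.embedding_utils import get_embeddings

embeddings = get_embeddings()
db = FAISS.from_texts(["esempio di testo"], embeddings, metadatas=[{"source": "demo.txt"}])
db.save_local("db/faiss_index_demo")
reloaded = FAISS.load_local("db/faiss_index_demo", embeddings, allow_dangerous_deserialization=True)
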
{utils → app/utils}/helpers.py
RENAMED
File without changes

app/utils/markdowns_utils.py
ADDED
import re

# =====================================
# Funzioni relative al Markdown
# =====================================

def clean_markdown(text):
    """Rimuove markdown dal testo"""
    text = re.sub(r'```[\s\S]*?```', '', text)  # blocchi codice
    text = re.sub(r'`.*?`', '', text)  # codice inline
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # link
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # bold
    text = re.sub(r'\*(.*?)\*', r'\1', text)  # italic
    return text.strip()

app/utils/voice_utils.py
ADDED
import logging
import asyncio
import edge_tts
from app.config import VOICE_USER, VOICE_ASSISTANT
from pathlib import Path
from app.utils.markdowns_utils import clean_markdown


async def text_to_speech(text, voice_name, output_file):
    """Genera audio usando edge-tts"""
    communicate = edge_tts.Communicate(text, voice_name)
    await communicate.save(output_file)

def generate_speech(text, is_user=True):
    try:
        # Crea directory per audio temporanei
        audio_dir = Path("temp_audio")
        audio_dir.mkdir(exist_ok=True)

        # Seleziona voce e genera nome file
        voice = VOICE_USER if is_user else VOICE_ASSISTANT
        file_name = f"speech_{hash(text)}.mp3"
        output_path = audio_dir / file_name

        # Genera audio
        asyncio.run(text_to_speech(text, voice, str(output_path)))
        return str(output_path)

    except Exception as e:
        logging.error(f"Errore TTS: {e}")
        return None

def generate_chat_audio(chat_history):
    """Genera audio della conversazione con voci alternate"""
    try:
        audio_files = []
        audio_dir = Path("temp_audio")
        audio_dir.mkdir(exist_ok=True)

        # Genera audio per ogni messaggio
        for msg in chat_history:
            content = clean_markdown(msg["content"])
            if not content.strip():
                continue

            voice = VOICE_USER if msg["role"] == "user" else VOICE_ASSISTANT
            file_name = f"chat_{msg['role']}_{hash(content)}.mp3"
            output_path = audio_dir / file_name

            # Genera audio senza prefissi
            asyncio.run(text_to_speech(content, voice, str(output_path)))
            audio_files.append(str(output_path))

        # Combina tutti gli audio
        if audio_files:
            from pydub import AudioSegment

            combined = AudioSegment.empty()
            for audio_file in audio_files:
                segment = AudioSegment.from_mp3(audio_file)
                combined += segment

            final_path = audio_dir / f"chat_complete_{hash(str(chat_history))}.mp3"
            combined.export(str(final_path), format="mp3")
            return str(final_path)

        return None

    except Exception as e:
        logging.error(f"Errore generazione audio: {e}")
        return None

db/.DS_Store
CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ

db/faiss_index/index.faiss
DELETED
Binary file (1.58 kB)

db/faiss_index/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:407d95e0808ddf251e3fb442241edd72c47961f5a38d5546021ef205b9fdeb57
-size 960117

db/faiss_index_Daniele2/.DS_Store
DELETED
Binary file (6.15 kB)

db/faiss_index_Daniele2/index.faiss
DELETED
Binary file (3.12 kB)

db/faiss_index_Daniele2/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e79bcca55b5153ea71218a3d2204c01ec1eccf59162fd4547d19956a4750d04e
-size 2958

db/faiss_index_Daniele2/metadata.json
DELETED
@@ -1,9 +0,0 @@
-[
-  {
-    "filename": "istruzioni obiettivi di apprendimento.pdf",
-    "title": "Obiettivi di apprendimento",
-    "author": "Daniele",
-    "upload_date": "2025-01-02 15:14:19",
-    "chunks": 2
-  }
-]

db/faiss_index_Orienta/index.faiss
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d4ffbc57fcbef553e507d44c7708b4e23b947f5af97c13d97359f3d814fc562a
-size 2362413

db/faiss_index_Orienta/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a89e4e492e28b30ef9ede1fd176dfcdac5e973884a56ab9d217a695136be8349
-size 3303433

db/faiss_index_Orienta/metadata.json
DELETED
@@ -1,9 +0,0 @@
-[
-  {
-    "filename": "Imparare a dirigere se stessi.pdf",
-    "title": "Imparare a dirigere se stessi ",
-    "author": "Pellerey",
-    "upload_date": "2025-01-02 22:47:28",
-    "chunks": 1538
-  }
-]

db/faiss_index_default_db/index.faiss
DELETED
Binary file (309 kB)

db/faiss_index_default_db/index.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4c797df1c4a8ddac75b4b083391220179ce5bbcd2b962b4dfbc7d960628cd0b2
-size 107706

ui/chatbot_tab.py
CHANGED
@@ -4,12 +4,13 @@ import logging
 import gradio as gr
 from app.functions.database_handling import list_databases
 from app.configs.prompts import SYSTEM_PROMPTS
-from app.llm_handling import answer_question, LLMType
-from utils.helpers import extract_text_from_files
-
-
+from app.llm_handling import answer_question, LLMType
+from app.utils.helpers import extract_text_from_files
+from app.utils.voice_utils import *
+from app.utils.markdowns_utils import clean_markdown
 
 
+logging.basicConfig(level=logging.INFO)
 
 
 def create_chatbot_tab():
@@ -87,26 +88,26 @@ def create_chatbot_tab():
 
         return str(Path(temp_path).absolute())
 
-    def download_audio(chat_history):
-        """Scarica l'ultimo messaggio audio dalla chat"""
-        try:
-            if not chat_history:
-                gr.Warning("Nessun messaggio nella chat")
-                return None
+    # def download_audio(chat_history):
+    #     """Scarica l'ultimo messaggio audio dalla chat"""
+    #     try:
+    #         if not chat_history:
+    #             gr.Warning("Nessun messaggio nella chat")
+    #             return None
 
-            # Prendi l'ultimo messaggio assistant
-            for msg in reversed(chat_history):
-                if msg["role"] == "assistant" and "audio" in msg:
-                    audio_path = msg["audio"]
-                    if audio_path and os.path.exists(audio_path):
-                        return audio_path
+    #         # Prendi l'ultimo messaggio assistant
+    #         for msg in reversed(chat_history):
+    #             if msg["role"] == "assistant" and "audio" in msg:
+    #                 audio_path = msg["audio"]
+    #                 if audio_path and os.path.exists(audio_path):
+    #                     return audio_path
 
-            gr.Warning("Nessun audio disponibile per l'ultima risposta")
-            return None
+    #         gr.Warning("Nessun audio disponibile per l'ultima risposta")
+    #         return None
 
-        except Exception as e:
-            gr.Error(f"Errore durante il download dell'audio: {str(e)}")
-            return None
+    #     except Exception as e:
+    #         gr.Error(f"Errore durante il download dell'audio: {str(e)}")
+    #         return None
 
     def format_conversation_for_audio(chat_history):
         """Formatta la conversazione per la sintesi vocale"""
@@ -116,25 +117,25 @@ def create_chatbot_tab():
             audio_text.append(f"{role} dice: {msg['content']}")
         return "\n".join(audio_text)
 
-    def generate_conversation_audio(chat_history):
-        """Genera audio della conversazione completa"""
-        try:
-            if not chat_history:
-                gr.Warning("Nessun messaggio nella chat")
-                return None
+    # def generate_conversation_audio(chat_history):
+    #     """Genera audio della conversazione completa"""
+    #     try:
+    #         if not chat_history:
+    #             gr.Warning("Nessun messaggio nella chat")
+    #             return None
 
-            conversation_text = format_conversation_for_audio(chat_history)
-            audio_path = generate_speech(conversation_text, is_user=False)
+    #         conversation_text = format_conversation_for_audio(chat_history)
+    #         audio_path = generate_speech(conversation_text, is_user=False)
 
-            if audio_path and os.path.exists(audio_path):
-                return audio_path
-            else:
-                gr.Warning("Errore nella generazione dell'audio")
-                return None
+    #         if audio_path and os.path.exists(audio_path):
+    #             return audio_path
+    #         else:
+    #             gr.Warning("Errore nella generazione dell'audio")
+    #             return None
 
-        except Exception as e:
-            gr.Error(f"Errore: {str(e)}")
-            return None
+    #     except Exception as e:
+    #         gr.Error(f"Errore: {str(e)}")
+    #         return None
 
     def convert_chat_to_audio(chat_history):
         if not chat_history:

ui/new_features_tab.py
DELETED
@@ -1,43 +0,0 @@
|
|
1 |
-
# ui/new_features_tab.py
|
2 |
-
|
3 |
-
import gradio as gr
|
4 |
-
from app.document_handling import search_documents
|
5 |
-
from app.functions.database_handling import list_databases
|
6 |
-
|
7 |
-
def create_new_features_tab():
|
8 |
-
"""Crea il tab 'Nuove Funzionalità' dell'interfaccia Gradio."""
|
9 |
-
|
10 |
-
def search_documents_callback(query, db_name):
|
11 |
-
"""Cerca documenti nel database in base alla query."""
|
12 |
-
results = search_documents(query, db_name)
|
13 |
-
return "\n".join(results)
|
14 |
-
|
15 |
-
# Ottieni la lista dei database
|
16 |
-
databases = list_databases()
|
17 |
-
|
18 |
-
with gr.Tab("Nuove Funzionalità"):
|
19 |
-
gr.Markdown("## Cerca Documenti e Genera Riassunto")
|
20 |
-
|
21 |
-
db_name_new = gr.Dropdown(choices=databases, label="Seleziona Database", value="default_db")
|
22 |
-
search_input = gr.Textbox(label="Inserisci Termini di Ricerca")
|
23 |
-
search_button = gr.Button("Cerca Documenti")
|
24 |
-
search_output = gr.Textbox(label="Documenti Trovati")
|
25 |
-
|
26 |
-
summary_button = gr.Button("Genera Riassunto")
|
27 |
-
summary_output = gr.Textbox(label="Riassunto")
|
28 |
-
|
29 |
-
# Evento per il bottone di ricerca
|
30 |
-
search_button.click(
|
31 |
-
search_documents_callback,
|
32 |
-
inputs=[search_input, db_name_new],
|
33 |
-
outputs=search_output
|
34 |
-
)
|
35 |
-
|
36 |
-
# Evento per il bottone di generazione riassunto (implementare generate_summary se necessario)
|
37 |
-
# summary_button.click(
|
38 |
-
# generate_summary,
|
39 |
-
# inputs=db_name_new,
|
40 |
-
# outputs=summary_output
|
41 |
-
# )
|
42 |
-
|
43 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|