Nugh75 committed
Commit 3c5ed5b · 1 Parent(s): f622cac

update structure

app.py CHANGED
@@ -3,7 +3,7 @@
 import gradio as gr
 import logging
 from app.logging_config import configure_logging
-from app.functions.database_handling import list_databases
+from app.utils.database_handling import list_databases
 from ui.chatbot_tab import create_chatbot_tab
 from ui.db_management_tab import create_db_management_tab
 from ui.document_management_tab import create_document_management_tab
app/document_handling.py CHANGED
@@ -1,70 +1,13 @@
 import logging
-import gradio as gr
 from langchain_community.vectorstores import FAISS
 import os
-import PyPDF2
-from docx import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 import json
 from datetime import datetime
-from app.functions.database_handling import BASE_DB_PATH
+from app.utils.database_handling import BASE_DB_PATH
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
-from app.utils.embedding_utils import get_embeddings
+from app.utils.embedding_utils import *
 from app.utils.dataclass_utils import DocumentMetadata, save_metadata
-
-
-# -------------- UTILITY FUNCTIONS --------------
-
-def extract_text_from_pdf(file_path):
-    """
-    Extracts the text from a PDF file.
-
-    Args:
-        file_path: Path to the PDF file
-
-    Returns:
-        str: Text extracted from the PDF
-    """
-    with open(file_path, 'rb') as f:
-        reader = PyPDF2.PdfReader(f)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text()
-    return text
-
-def extract_text_from_docx(file_path):
-    """
-    Extracts the text from a DOCX file.
-
-    Args:
-        file_path: Path to the DOCX file
-
-    Returns:
-        str: Text extracted from the Word document
-    """
-    doc = Document(file_path)
-    text = ""
-    for para in doc.paragraphs:
-        text += para.text + "\n"
-    return text
-
-def create_chunks(text):
-    from app.config import EMBEDDING_CONFIG
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=EMBEDDING_CONFIG["chunk_size"],
-        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
-        length_function=len,
-        separators=["\n\n", "\n", " ", ""]
-    )
-    return text_splitter.split_text(text)
-
-
-def create_vectorstore(texts, metadatas, db_path):
-    embeddings = get_embeddings()
-    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
-
-
+from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx
 
 
 # -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
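
Note on the rewritten import block: `from app.utils.embedding_utils import *` swaps the previous explicit `get_embeddings` import for a wildcard, which pulls every public name of `embedding_utils` into `document_handling`'s namespace. An explicit equivalent, assuming the three helpers `embedding_utils` now defines are the only names this module needs, would be:

    # Hypothetical explicit form of the wildcard import; assumes these are
    # the only embedding_utils names that document_handling actually uses.
    from app.utils.embedding_utils import get_embeddings, create_chunks, create_vectorstore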
app/functions/__init__.py DELETED
File without changes
app/llm_handling.py CHANGED
@@ -10,7 +10,7 @@ from app.config import BASE_DB_PATH  # Ensure correct import
 from app.config import LLM_CONFIGS, LLMType  # Import LLMType and LLM_CONFIGS
 from app.configs.prompts import SYSTEM_PROMPTS
 from app.utils.embedding_utils import get_embeddings
-from app.utils.voice_utils import generate_speech  # Retain import if needed
+
 
 logging.basicConfig(level=logging.INFO)
 
app/llm_handling2.py DELETED
@@ -1,188 +0,0 @@
-# llm_handling.py
-import logging
-import os
-from langchain_community.vectorstores import FAISS
-import requests
-from tenacity import retry, stop_after_attempt, wait_exponential
-import json
-
-from app.config import *
-from app.configs.prompts import SYSTEM_PROMPTS
-from app.utils.embedding_utils import get_embeddings
-from app.utils.voice_utils import generate_speech
-
-logging.basicConfig(level=logging.INFO)
-
-# =====================================
-# LLM-related functions
-# =====================================
-
-def get_llm_client(llm_type: LLMType):
-    """Gets the appropriate client for the selected model"""
-    config = LLM_CONFIGS.get(llm_type)
-    if not config:
-        raise ValueError(f"Model {llm_type} is not supported")
-    return config["client"](), config["model"]
-
-def get_system_prompt(prompt_type="tutor"):
-    """Selects the appropriate system prompt"""
-    return SYSTEM_PROMPTS.get(prompt_type, SYSTEM_PROMPTS["tutor"])
-
-def test_local_connection():
-    """Checks the connection to the local LLM server"""
-    try:
-        response = requests.get(f"http://192.168.82.5:1234/v1/health", timeout=5)
-        return response.status_code == 200
-    except:
-        return False
-
-def read_metadata(db_path):
-    metadata_file = os.path.join(db_path, "metadata.json")
-    if os.path.exists(metadata_file):
-        with open(metadata_file, 'r') as f:
-            return json.load(f)
-    return []
-
-def get_relevant_documents(vectorstore, question, min_similarity=0.7):
-    """Retrieves the relevant documents from the vectorstore"""
-    try:
-        # Refine the query before searching
-        enhanced_query = enhance_query(question)
-
-        # Get documents together with similarity scores
-        docs_and_scores = vectorstore.similarity_search_with_score(
-            enhanced_query,
-            k=8  # Increase the number of retrieved documents
-        )
-
-        # Filter the documents by similarity
-        filtered_docs = [
-            doc for doc, score in docs_and_scores
-            if score >= min_similarity
-        ]
-
-        # Log the results for debugging
-        logging.info(f"Query: {question}")
-        logging.info(f"Documents found: {len(filtered_docs)}")
-
-        # Return up to five documents, or an empty list
-        return filtered_docs[:5] if filtered_docs else []
-
-    except Exception as e:
-        logging.error(f"Error while retrieving documents: {e}")
-        return []  # Return an empty list instead of None
-
-def enhance_query(question):
-    # Remove insignificant words (Italian stop words)
-    stop_words = set(['il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una'])
-    words = [w for w in question.lower().split() if w not in stop_words]
-
-    # Extract the key words
-    enhanced_query = " ".join(words)
-    return enhanced_query
-
-def log_search_results(question, docs_and_scores):
-    logging.info(f"Query: {question}")
-    for idx, (doc, score) in enumerate(docs_and_scores, 1):
-        logging.info(f"Doc {idx} - Score: {score:.4f}")
-        logging.info(f"Content: {doc.page_content[:100]}...")
-
-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
-def answer_question(question, db_name, prompt_type="tutor", chat_history=None, llm_type=LLMType.OPENAI_GPT_4O_MINI):
-    if chat_history is None:
-        chat_history = []
-
-    try:
-        embeddings = get_embeddings()  # Use the shared helper
-        db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
-
-        # Read the metadata
-        metadata_list = read_metadata(db_path)
-        metadata_dict = {m["filename"]: m for m in metadata_list}
-
-        # Retrieve the relevant documents
-        vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
-        relevant_docs = get_relevant_documents(vectorstore, question)
-
-        if not relevant_docs:
-            return [
-                {"role": "user", "content": question},
-                {"role": "assistant", "content": "Sorry, I could not find relevant information to answer your question. Try rephrasing it or asking a different question."}
-            ]
-
-        # Prepare the source citations with chunk numbering
-        sources = []
-        for idx, doc in enumerate(relevant_docs, 1):
-            source_file = doc.metadata.get("source", "Unknown")
-            if source_file in metadata_dict:
-                meta = metadata_dict[source_file]
-                sources.append(f"📚 {meta['title']} (Author: {meta['author']}) - Part {idx} of {len(relevant_docs)}")
-
-        # Prepare the context with the sources
-        context = "\n".join([
-            f"[Part {idx+1} of {len(relevant_docs)}]\n{doc.page_content}"
-            for idx, doc in enumerate(relevant_docs)
-        ])
-        sources_text = "\n\nSources consulted:\n" + "\n".join(set(sources))
-
-        # Extend the prompt to require source citations
-        prompt = SYSTEM_PROMPTS[prompt_type].format(context=context)
-        prompt += "\nAlways cite the sources used in your answer, including the document title and the author."
-
-        # Build the full message list
-        messages = [
-            {"role": "system", "content": prompt},
-            *[{"role": m["role"], "content": m["content"]} for m in chat_history],
-            {"role": "user", "content": question}
-        ]
-
-        # Get the answer from the LLM
-        client, model = get_llm_client(llm_type)
-        response = client.chat.completions.create(
-            model=model,
-            messages=messages,
-            temperature=0.7,
-            max_tokens=2048
-        )
-        answer = response.choices[0].message.content + sources_text
-
-
-
-        # return [
-        #     {"role": "user", "content": question, "audio": user_audio},
-        #     {"role": "assistant", "content": answer, "audio": assistant_audio}
-        # ]
-
-    except Exception as e:
-        logging.error(f"Error while generating the answer: {e}")
-        error_msg = "Local LLM unavailable. Try again later or use OpenAI." if "local" in str(llm_type) else str(e)
-        return [
-            {"role": "user", "content": question},
-            {"role": "assistant", "content": f"⚠️ {error_msg}"}
-        ]
-
-class DocumentRetriever:
-    def __init__(self, db_path):
-        self.embeddings = get_embeddings()
-        self.vectorstore = FAISS.load_local(
-            db_path,
-            self.embeddings,
-            allow_dangerous_deserialization=True
-        )
-
-    def get_relevant_chunks(self, question):
-        enhanced_query = enhance_query(question)
-        docs_and_scores = self.vectorstore.similarity_search_with_score(
-            enhanced_query,
-            k=8
-        )
-
-        log_search_results(question, docs_and_scores)
-        return self._filter_relevant_docs(docs_and_scores)
-
-
-if __name__ == "__main__":
-    pass
app/{functions → utils}/database_handling.py RENAMED
File without changes
app/utils/dataclass_utils.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import json
 from dataclasses import dataclass
-from app.functions.database_handling import BASE_DB_PATH
+from app.utils.database_handling import BASE_DB_PATH
 
 @dataclass
 class DocumentMetadata:
app/utils/embedding_utils.py CHANGED
@@ -1,6 +1,8 @@
 import torch
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from app.config import EMBEDDING_CONFIG
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
 def get_embeddings():
 
     """Initializes the embeddings using the configured model"""
@@ -9,4 +11,20 @@ def get_embeddings():
         model_name=EMBEDDING_CONFIG["model_name"],
         model_kwargs={'device': device}
     )
+def create_chunks(text):
+    from app.config import EMBEDDING_CONFIG
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=EMBEDDING_CONFIG["chunk_size"],
+        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    )
+    return text_splitter.split_text(text)
+
+
+def create_vectorstore(texts, metadatas, db_path):
+    embeddings = get_embeddings()
+    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
+
+
 
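As committed, `create_vectorstore` references `FAISS` without importing it in this module, and it builds the index but neither saves nor returns it. A minimal sketch of the likely intent, where the FAISS import, the `save_local` call, and the `return` are assumptions rather than part of this commit:

    from langchain_community.vectorstores import FAISS  # assumed: not imported in the committed module

    def create_vectorstore(texts, metadatas, db_path):
        # Build a FAISS index over the chunk texts with their metadata,
        # then persist it where db_path points (assumed intent).
        embeddings = get_embeddings()
        db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        db.save_local(db_path)
        return db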
app/utils/extract_utils.py ADDED
@@ -0,0 +1,35 @@
+import PyPDF2
+from docx import Document
+
+def extract_text_from_pdf(file_path):
+    """
+    Extracts the text from a PDF file.
+
+    Args:
+        file_path: Path to the PDF file
+
+    Returns:
+        str: Text extracted from the PDF
+    """
+    with open(file_path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+    return text
+
+def extract_text_from_docx(file_path):
+    """
+    Extracts the text from a DOCX file.
+
+    Args:
+        file_path: Path to the DOCX file
+
+    Returns:
+        str: Text extracted from the Word document
+    """
+    doc = Document(file_path)
+    text = ""
+    for para in doc.paragraphs:
+        text += para.text + "\n"
+    return text
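
One caveat in the added PDF helper: `page.extract_text()` returns an empty string for pages with no extractable text (for example scanned images), and some PyPDF2 versions have been reported to return `None`, which would make the `+=` concatenation raise a `TypeError`. A defensive variant of the loop, as a sketch:

        for page in reader.pages:
            # The "or" fallback is a defensive assumption covering PyPDF2
            # versions where extract_text() may yield None.
            text += page.extract_text() or ""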
ui/chatbot_tab.py CHANGED
@@ -2,7 +2,7 @@
 
 import logging
 import gradio as gr
-from app.functions.database_handling import list_databases
+from app.utils.database_handling import list_databases
 from app.configs.prompts import SYSTEM_PROMPTS
 from app.llm_handling import answer_question, LLMType
 from app.utils.helpers import extract_text_from_files
ui/db_management_tab.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from app.functions.database_handling import create_database, modify_database, delete_database, list_databases
+from app.utils.database_handling import create_database, modify_database, delete_database, list_databases
 
 def create_db_management_tab(dropdowns):
     databases = list_databases()
ui/document_management_tab.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import logging
 from app.document_handling import upload_and_index, list_indexed_files, delete_file_from_database
-from app.functions.database_handling import list_databases
+from app.utils.database_handling import list_databases
 
 def create_document_management_tab():
     """Creates the 'Document Management' tab of the Gradio interface."""
ui/management_tabs.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import logging
 from app.document_handling import upload_and_index, list_indexed_files, delete_file_from_database
-from app.functions.database_handling import (
+from app.utils.database_handling import (
     create_database,
     modify_database,
     delete_database,