import logging
import gradio as gr
from langchain_community.vectorstores import FAISS
import os
import PyPDF2
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from datetime import datetime
from app.functions.database_handling import BASE_DB_PATH
from langchain_community.embeddings import HuggingFaceEmbeddings
from app.config import EMBEDDING_CONFIG, EMBEDDING_MODEL
from app.utils.embedding_utils import get_embeddings
from app.utils.dataclass_utils import DocumentMetadata, save_metadata

# -------------- UTILITY FUNCTIONS --------------
def extract_text_from_pdf(file_path):
    """
    Extracts the text from a PDF file.

    Args:
        file_path: Path to the PDF file

    Returns:
        str: Text extracted from the PDF
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
    return text
def extract_text_from_docx(file_path):
    """
    Extracts the text from a DOCX file.

    Args:
        file_path: Path to the DOCX file

    Returns:
        str: Text extracted from the Word document
    """
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text
def create_chunks(text):
    """Splits the text into overlapping chunks using the sizes from EMBEDDING_CONFIG."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=EMBEDDING_CONFIG["chunk_size"],
        chunk_overlap=EMBEDDING_CONFIG["chunk_overlap"],
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    return text_splitter.split_text(text)
def create_vectorstore(texts, metadatas, db_path):
    embeddings = get_embeddings()
    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    # Persist the index to db_path and return it
    db.save_local(db_path)
    return db

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
def upload_and_index(files, title, author, db_name="default_db"):
    if not files:
        return "No files uploaded."

    documents = []
    doc_metadata = []

    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                # .txt or other plain-text file
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()

            chunks = create_chunks(text)

            # Metadata for the document as a whole
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )

            # Metadata for each chunk
            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)

            doc_metadata.append(doc_meta)

        except Exception as e:
            logging.error(f"Error while reading file {file.name}: {e}")
            continue
    if documents:
        try:
            db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
            os.makedirs(db_path, exist_ok=True)

            # Use the centralized helper instead of initializing embeddings directly
            embeddings = get_embeddings()

            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]

            # Create (or overwrite) the FAISS index with these documents
            vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            vectorstore.save_local(db_path)

            # Save the document metadata to file
            save_metadata(doc_metadata, db_name)

            return f"Documents successfully indexed in database '{db_name}'!"
        except Exception as e:
            logging.error(f"Error during indexing: {e}")
            return f"Error during indexing: {e}"

    return "No documents processed."
def list_indexed_files(db_name="default_db"):
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    metadata_file = os.path.join(db_path, "metadata.json")

    if not os.path.exists(metadata_file):
        return "No files in the database."

    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        if not metadata:
            return "No documents in the database."

        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Author: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Uploaded on: {doc['upload_date']}\n"
            )

        return "\n".join(output) if output else "No documents in the database."
    except Exception as e:
        logging.error(f"Error reading metadata: {e}")
        return f"Error reading metadata: {e}"
def delete_file_from_database(file_name, db_name="default_db"):
    """
    Simplified example: you may also want to remove the chunks
    from FAISS. Currently the function only maintains a 'file_list.txt';
    adapt it to your chunk-removal needs.
    """
    db_path = os.path.join(BASE_DB_PATH, f"faiss_index_{db_name}")
    file_list_path = os.path.join(db_path, "file_list.txt")

    if not os.path.exists(file_list_path):
        return "Database not found (file_list.txt missing)."

    try:
        # Read the list of files
        with open(file_list_path, "r") as f:
            files = f.readlines()

        # Remove the file from the list
        files = [line.strip() for line in files if line.strip() != file_name]

        # Rewrite the updated list
        with open(file_list_path, "w") as f:
            for fl in files:
                f.write(f"{fl}\n")

        return f"File '{file_name}' removed from database '{db_name}'."
    except Exception as e:
        return f"Error while removing the file: {e}"