import logging
import os
import shutil
import json
from dataclasses import dataclass
from datetime import datetime

import gradio as gr
import PyPDF2
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Basic logging configuration so the info/warning/error calls below are visible
logging.basicConfig(level=logging.INFO)

# Initialize the text splitter: 500-character chunks with 100 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# -------------- UTILITY FUNCTIONS --------------

# Decorated as a dataclass so instances can be built with keyword arguments,
# as upload_and_index does below.
@dataclass
class DocumentMetadata:
    filename: str
    title: str
    author: str
    upload_date: str
    chunks: int

    def to_dict(self):
        return {
            "filename": self.filename,
            "title": self.title,
            "author": self.author,
            "upload_date": self.upload_date,
            "chunks": self.chunks,
        }

def save_metadata(metadata_list, db_name):
    """Append document metadata to the database's metadata.json."""
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    existing_metadata = []
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            existing_metadata = json.load(f)
    existing_metadata.extend([m.to_dict() for m in metadata_list])
    with open(metadata_file, 'w') as f:
        json.dump(existing_metadata, f, indent=2)

def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
    return text

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

# -------------- CHATBOT TAB FUNCTIONS --------------

def answer_question(question, db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        logging.warning(f"The FAISS index for database {db_name} does not exist.")
        return "Database not found."
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    # Perform a similarity search (default: top 4 chunks)
    docs = vectorstore.similarity_search(question)
    if not docs:
        return "No documents match the query."
    # Collect the document contents
    results = [doc.page_content for doc in docs]
    return "\n\n".join(results)

# -------------- DATABASE MANAGEMENT TAB FUNCTIONS --------------

def create_database(db_name):
    logging.info(f"Creating database: {db_name}")
    db_path = f"faiss_index_{db_name}"
    if os.path.exists(db_path):
        # Return the same (message, databases) shape as the other branches
        return (f"Database {db_name} already exists.", list_databases())
    try:
        os.makedirs(db_path)
        logging.info(f"Database {db_name} created successfully.")
        databases = list_databases()
        return (f"Database {db_name} created successfully.", databases)
    except Exception as e:
        logging.error(f"Error creating database: {e}")
        return (f"Error creating database: {e}", [])

def delete_database(db_name):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        return f"Database {db_name} does not exist."
    try:
        shutil.rmtree(db_path)
        logging.info(f"Database {db_name} deleted successfully.")
        return f"Database {db_name} deleted successfully."
    except OSError as e:
        logging.error(f"Could not delete database {db_name}: {e}")
        return f"Could not delete database {db_name}: {e}"

def modify_database(old_db_name, new_db_name):
    old_db_path = f"faiss_index_{old_db_name}"
    new_db_path = f"faiss_index_{new_db_name}"
    if not os.path.exists(old_db_path):
        return f"Database {old_db_name} does not exist."
    if os.path.exists(new_db_path):
        return f"Database {new_db_name} already exists."
    try:
        os.rename(old_db_path, new_db_path)
        return f"Database {old_db_name} renamed to {new_db_name} successfully."
    except Exception as e:
        return f"Error renaming the database: {e}"

def list_databases():
    try:
        databases = []
        for item in os.listdir():
            if os.path.isdir(item) and item.startswith("faiss_index_"):
                db_name = item.replace("faiss_index_", "")
                databases.append(db_name)
        # Ensure "default_db" is in the list
        if "default_db" not in databases:
            databases.append("default_db")
        return databases
    except Exception as e:
        logging.error(f"Error listing databases: {e}")
        return []

# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------

def upload_and_index(files, title, author, db_name="default_db"):
    if not files:
        return "No files uploaded."
    documents = []
    doc_metadata = []
    for file in files:
        try:
            if file.name.endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()
            chunks = text_splitter.split_text(text)
            # Document-level metadata
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            # Per-chunk metadata
            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)
            doc_metadata.append(doc_meta)
        except Exception as e:
            logging.error(f"Error reading file {file.name}: {e}")
            continue
    if documents:
        try:
            db_path = f"faiss_index_{db_name}"
            os.makedirs(db_path, exist_ok=True)
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
            # Add to the existing index if one is present; otherwise create it.
            # Building a fresh index unconditionally would overwrite previously
            # indexed documents while metadata.json kept accumulating them.
            if os.path.exists(os.path.join(db_path, "index.faiss")):
                vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
                vectorstore.add_texts(texts, metadatas=metadatas)
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            vectorstore.save_local(db_path)
            # Save the document-level metadata
            save_metadata(doc_metadata, db_name)
            return f"Documents indexed successfully in database {db_name}!"
        except Exception as e:
            logging.error(f"Indexing error: {e}")
            return f"Indexing error: {e}"
    return "No documents processed."

def list_indexed_files(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(metadata_file):
        return "No files in the database."
    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f"   Author: {doc['author']}\n"
                f"   File: {doc['filename']}\n"
                f"   Chunks: {doc['chunks']}\n"
                f"   Uploaded on: {doc['upload_date']}\n"
            )
        return "\n".join(output) if output else "No documents in the database."
    except Exception as e:
        logging.error(f"Error reading metadata: {e}")
        return f"Error reading metadata: {e}"

def delete_file_from_database(file_name, db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(metadata_file):
        return "Database not found."
    try:
        # Read the list of indexed documents from metadata.json, which is the
        # record this module actually maintains
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        # Drop the entry matching the given filename
        remaining = [doc for doc in metadata if doc.get("filename") != file_name]
        if len(remaining) == len(metadata):
            return f"File {file_name} not found in database {db_name}."
        # Rewrite the updated list
        with open(metadata_file, 'w') as f:
            json.dump(remaining, f, indent=2)
        # Note: the file's vectors remain in the FAISS index; fully removing
        # them would require re-indexing the remaining documents.
        return f"File {file_name} removed from database {db_name}."
    except Exception as e:
        return f"Error while removing the file: {e}"

# -------------- DOCUMENT VISUALIZATION TAB FUNCTIONS --------------

def list_indexed_documents(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(db_path):
        return f"Database {db_name} does not exist."
    if not os.path.exists(metadata_file):
        return f"No documents in database {db_name}."
    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        if not metadata:
            return "No documents found in the database."
        output_lines = ["📚 Documents in the database:"]
        for doc in metadata:
            output_lines.extend([
                f"\n📄 Document: {doc['title']}",
                f"   👤 Author: {doc['author']}",
                f"   📁 File: {doc['filename']}",
                f"   📅 Uploaded on: {doc['upload_date']}",
                f"   🔢 Chunks: {doc['chunks']}"
            ])
        result = "\n".join(output_lines)
        logging.info(f"Documents found in database {db_name}: {result}")
        return result
    except Exception as e:
        error_msg = f"Error reading metadata: {e}"
        logging.error(error_msg)
        return error_msg

# -------------- NEW FEATURES TAB FUNCTIONS --------------

def search_documents(query, db_name="default_db"):
    # Identical retrieval path to answer_question; delegate rather than
    # duplicating the load-and-search logic verbatim.
    return answer_question(query, db_name)

def generate_summary(db_name="default_db"):
    # Placeholder for summarization logic
    return "This is a summary of the documents in the database."