# Edurag_beta/app/document_handling.py
import logging
import gradio as gr  # Added missing import (gr is not referenced directly in this module)
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil
import PyPDF2
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dataclasses import dataclass
import json
from datetime import datetime
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
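# Example: with chunk_size=500 and chunk_overlap=100, a ~1200-character text
# yields three overlapping chunks (roughly chars 0-500, 400-900, 800-1200),
# so content that straddles a chunk boundary survives inside the overlap.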
# -------------- UTILITY FUNCTIONS --------------
@dataclass
class DocumentMetadata:
    filename: str
    title: str
    author: str
    upload_date: str
    chunks: int

    def to_dict(self):
        return {
            "filename": self.filename,
            "title": self.title,
            "author": self.author,
            "upload_date": self.upload_date,
            "chunks": self.chunks
        }
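# Example (illustrative values):
#     meta = DocumentMetadata(filename="tesi.pdf", title="Tesi di laurea",
#                             author="Rossi", upload_date="2024-01-01 12:00:00",
#                             chunks=42)
#     meta.to_dict()  # JSON-serializable dict, ready for metadata.json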
def save_metadata(metadata_list, db_name):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    existing_metadata = []
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            existing_metadata = json.load(f)
    existing_metadata.extend([m.to_dict() for m in metadata_list])
    with open(metadata_file, 'w') as f:
        json.dump(existing_metadata, f, indent=2)
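# metadata.json accumulates a flat list of document records, e.g. (values
# illustrative):
# [
#   {"filename": "tesi.pdf", "title": "Tesi di laurea", "author": "Rossi",
#    "upload_date": "2024-01-01 12:00:00", "chunks": 42}
# ]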
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() may return None (e.g. for image-only pages);
            # guard against it so the concatenation never raises TypeError
            text += page.extract_text() or ""
    return text
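# Note: PyPDF2 is no longer maintained; its successor pypdf exposes the same
# PdfReader API (pypdf.PdfReader), making it a drop-in replacement if the
# dependency is ever updated.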
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text
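# Note: python-docx's Document.paragraphs covers only body paragraphs; text in
# tables, headers, and footers is not included, so table-heavy documents would
# need doc.tables handled separately.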
# -------------- CHATBOT TAB FUNCTIONS --------------
def answer_question(question, db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        logging.warning(f"L'indice FAISS per il database {db_name} non esiste.")
        return "Database non trovato."
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    # Perform a similarity search
    docs = vectorstore.similarity_search(question)
    if not docs:
        return "Nessun documento corrispondente alla query."
    # Collect the document contents
    results = [doc.page_content for doc in docs]
    return "\n\n".join(results)
# -------------- DATABASE MANAGEMENT TAB FUNCTIONS --------------
def create_database(db_name):
    logging.info(f"Creating database: {db_name}")
    db_path = f"faiss_index_{db_name}"
    # Every branch returns a (message, database list) pair so callers can
    # unpack the result consistently
    if os.path.exists(db_path):
        return (f"Il database {db_name} esiste già.", list_databases())
    try:
        os.makedirs(db_path)
        logging.info(f"Database {db_name} created successfully.")
        return (f"Database {db_name} creato con successo.", list_databases())
    except Exception as e:
        logging.error(f"Errore nella creazione del database: {e}")
        return (f"Errore nella creazione del database: {e}", [])
def delete_database(db_name):
    db_path = f"faiss_index_{db_name}"
    if not os.path.exists(db_path):
        return f"Il database {db_name} non esiste."
    try:
        shutil.rmtree(db_path)
        logging.info(f"Database {db_name} eliminato con successo.")
        return f"Database {db_name} eliminato con successo."
    except OSError as e:
        logging.error(f"Impossibile eliminare il database {db_name}: {e}")
        return f"Impossibile eliminare il database {db_name}: {e}"
def modify_database(old_db_name, new_db_name):
    old_db_path = f"faiss_index_{old_db_name}"
    new_db_path = f"faiss_index_{new_db_name}"
    if not os.path.exists(old_db_path):
        return f"Il database {old_db_name} non esiste."
    if os.path.exists(new_db_path):
        return f"Il database {new_db_name} esiste già."
    try:
        os.rename(old_db_path, new_db_path)
        return f"Database {old_db_name} rinominato in {new_db_name} con successo."
    except Exception as e:
        return f"Errore durante la modifica del database: {e}"
def list_databases():
    try:
        databases = []
        for item in os.listdir():
            if os.path.isdir(item) and item.startswith("faiss_index_"):
                db_name = item.replace("faiss_index_", "")
                databases.append(db_name)
        # Ensure "default_db" is in the list
        if "default_db" not in databases:
            databases.append("default_db")
        return databases
    except Exception as e:
        logging.error(f"Error listing databases: {e}")
        return []
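# Typical management flow (illustrative database names):
#     create_database("corso_storia")              # -> (message, updated list)
#     modify_database("corso_storia", "storia_moderna")
#     delete_database("storia_moderna")
# list_databases() discovers databases by scanning the working directory for
# faiss_index_* folders, so it always reflects what exists on disk.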
# -------------- DOCUMENT MANAGEMENT TAB FUNCTIONS --------------
def upload_and_index(files, title, author, db_name="default_db"):
    if not files:
        return "Nessun file caricato."
    documents = []
    doc_metadata = []
    for file in files:
        try:
            # Route by extension (case-insensitive); anything that is not
            # PDF or DOCX is read as plain UTF-8 text
            if file.name.lower().endswith('.pdf'):
                text = extract_text_from_pdf(file.name)
            elif file.name.lower().endswith('.docx'):
                text = extract_text_from_docx(file.name)
            else:
                with open(file.name, 'r', encoding='utf-8') as f:
                    text = f.read()
            chunks = text_splitter.split_text(text)
            # Document-level metadata
            doc_meta = DocumentMetadata(
                filename=os.path.basename(file.name),
                title=title,
                author=author,
                upload_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                chunks=len(chunks)
            )
            # Metadata for each chunk
            for i, chunk in enumerate(chunks):
                chunk_metadata = {
                    "content": chunk,
                    "source": os.path.basename(file.name),
                    "title": title,
                    "author": author,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "upload_date": doc_meta.upload_date
                }
                documents.append(chunk_metadata)
            doc_metadata.append(doc_meta)
        except Exception as e:
            logging.error(f"Errore durante la lettura del file {file.name}: {e}")
            continue
    if documents:
        try:
            db_path = f"faiss_index_{db_name}"
            os.makedirs(db_path, exist_ok=True)
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            texts = [doc["content"] for doc in documents]
            metadatas = [{k: v for k, v in doc.items() if k != "content"} for doc in documents]
            if os.path.exists(os.path.join(db_path, "index.faiss")):
                # Merge into the existing index instead of overwriting it, so
                # vectors from earlier uploads survive (save_metadata already
                # appends rather than replaces)
                vectorstore = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
                vectorstore.add_texts(texts, metadatas=metadatas)
            else:
                vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            vectorstore.save_local(db_path)
            # Save the document-level metadata
            save_metadata(doc_metadata, db_name)
            return f"Documenti indicizzati con successo nel database {db_name}!"
        except Exception as e:
            logging.error(f"Errore durante l'indicizzazione: {e}")
            return f"Errore durante l'indicizzazione: {e}"
    return "Nessun documento processato."
def list_indexed_files(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(metadata_file):
        return "Nessun file nel database."
    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        output = []
        for doc in metadata:
            output.append(
                f"📄 {doc['title']}\n"
                f" Autore: {doc['author']}\n"
                f" File: {doc['filename']}\n"
                f" Chunks: {doc['chunks']}\n"
                f" Caricato il: {doc['upload_date']}\n"
            )
        return "\n".join(output) if output else "Nessun documento nel database."
    except Exception as e:
        logging.error(f"Errore nella lettura dei metadati: {e}")
        return f"Errore nella lettura dei metadati: {e}"
def delete_file_from_database(file_name, db_name="default_db"):
    # Documents are tracked in metadata.json (written by save_metadata),
    # so deletion removes the matching record from there
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(metadata_file):
        return "Database non trovato."
    try:
        # Read the document records
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        # Drop the matching file
        remaining = [doc for doc in metadata if doc.get("filename") != file_name]
        if len(remaining) == len(metadata):
            return f"File {file_name} non trovato nel database {db_name}."
        # Rewrite the updated record list
        with open(metadata_file, 'w') as f:
            json.dump(remaining, f, indent=2)
        # The file's chunk vectors remain in the FAISS index;
        # delete_vectors_for_file below sketches how to remove those too
        return f"File {file_name} rimosso dal database {db_name}."
    except Exception as e:
        return f"Errore durante la rimozione del file: {e}"
# -------------- DOCUMENT VISUALIZATION TAB FUNCTIONS --------------
def list_indexed_documents(db_name="default_db"):
    db_path = f"faiss_index_{db_name}"
    metadata_file = os.path.join(db_path, "metadata.json")
    if not os.path.exists(db_path):
        return f"Il database {db_name} non esiste."
    if not os.path.exists(metadata_file):
        return f"Nessun documento nel database {db_name}."
    try:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        if not metadata:
            return "Nessun documento trovato nel database."
        output_lines = ["📚 Documenti nel database:"]
        for doc in metadata:
            output_lines.extend([
                f"\n📄 Documento: {doc['title']}",
                f" 📝 Autore: {doc['author']}",
                f" 📁 File: {doc['filename']}",
                f" 🕒 Caricato il: {doc['upload_date']}",
                f" 📑 Chunks: {doc['chunks']}"
            ])
        result = "\n".join(output_lines)
        logging.info(f"Documenti trovati nel database {db_name}: {result}")
        return result
    except Exception as e:
        error_msg = f"Errore nella lettura dei metadati: {e}"
        logging.error(error_msg)
        return error_msg
# -------------- NEW FEATURES TAB FUNCTIONS --------------
def search_documents(query, db_name="default_db"):
    # Same FAISS lookup as answer_question above; delegate so the retrieval
    # logic lives in one place
    return answer_question(query, db_name=db_name)
def generate_summary(db_name="default_db"):
    # Placeholder for summarization logic
    return "This is a summary of the documents in the database."