Spaces:

Nugh75
/

Edurag_beta

Sleeping

App Files Files Community

Nugh75 committed on Jan 7

Commit

a5de847

1 Parent(s): e6b7117

uploadfile

Browse files

Files changed (3) hide show

app/utils/extract_utils.py +25 -9
app/utils/helpers.py +49 -86
db/.DS_Store +0 -0

app/utils/extract_utils.py CHANGED Viewed

@@ -1,22 +1,38 @@
-import PyPDF2
 from docx import Document
 def extract_text_from_pdf(file_path):
     """
-    Estrae il testo da un file PDF.
     Args:
         file_path: Percorso del file PDF
     Returns:
-        str: Testo estratto dal PDF
     """
-    with open(file_path, 'rb') as f:
-        reader = PyPDF2.PdfReader(f)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text()
-        return text
 def extract_text_from_docx(file_path):
     """

+import pdfplumber
 from docx import Document
+import logging
+import io
 def extract_text_from_pdf(file_path):
     """
     Extract the text of a PDF file with pdfplumber.
     Pages are processed one by one; a page whose extraction fails is
     logged as a warning and skipped, and a page that yields no text
     contributes an empty string. Page texts are joined with newlines.
     Args:
         file_path: Path of the PDF file
     Returns:
         str: Text extracted from the PDF
     Raises:
         ValueError: If the joined text is empty or whitespace only.
         Exception: Any other error raised while opening or reading the
             PDF is logged as an error and re-raised unchanged.
     """
     try:
         with pdfplumber.open(file_path) as document:
             page_texts = []
             for current_page in document.pages:
                 try:
                     # extract_text() may yield nothing; normalise that to ""
                     page_texts.append(current_page.extract_text() or "")
                 except Exception as e:
                     # Best effort: one bad page must not abort the whole file
                     logging.warning(f"Errore nell'estrazione della pagina: {str(e)}")
             joined_text = "\n".join(page_texts)
             if joined_text.strip():
                 return joined_text
             # Raised inside the outer try on purpose so it is logged below too
             raise ValueError("Nessun testo estratto dal PDF")
     except Exception as e:
         logging.error(f"Errore nella lettura del PDF: {str(e)}")
         raise
 def extract_text_from_docx(file_path):
     """

app/utils/helpers.py CHANGED Viewed

@@ -1,101 +1,64 @@
 import logging
-from app.document_handling import extract_text_from_pdf, extract_text_from_docx
-import tempfile
 import os
-from datetime import datetime
 import shutil
 def extract_text_from_files(files):
-    """
-    Estrae e concatena il testo da file PDF, DOCX e TXT.
-    Args:
-        files (list): Lista di file caricati.
-    Returns:
-        str: Testo concatenato estratto dai file.
-    """
     if not files:
-        logging.warning("Nessun file fornito")
         return ""
-    logging.info(f"Ricevuti {len(files)} file da elaborare")
-    # Crea la cartella Temp_file se non esiste
-    temp_dir = os.path.join(os.path.dirname(__file__), '..', 'Temp_file')
-    os.makedirs(temp_dir, exist_ok=True)
-    logging.info(f"Cartella Temp_file: {temp_dir}")
-    text = ""
     for file in files:
         try:
-            file_path = None
-            # Gestione degli oggetti NamedString di Gradio
-            if type(file).__name__ == 'NamedString':
-                original_name = getattr(file, 'name', 'file')
-                _, ext = os.path.splitext(original_name)
-                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-                temp_path = os.path.join(temp_dir, f"temp_{timestamp}{ext if ext else '.txt'}")
-                with open(temp_path, 'wb') as tmp_file:
-                    if hasattr(file, 'encode'):
-                        tmp_file.write(file.encode())
-                    elif hasattr(file, 'read'):
-                        tmp_file.write(file.read())
-                    else:
-                        raise ValueError("Impossibile convertire l'oggetto in bytes")
-                    file_path = temp_path
-                logging.info(f"File temporaneo creato: {file_path}")
                 try:
-                    if file_path.lower().endswith('.pdf'):
-                        text += extract_text_from_pdf(file_path)
-                    elif file_path.lower().endswith('.docx'):
-                        text += extract_text_from_docx(file_path)
-                    elif file_path.lower().endswith('.txt'):
-                        with open(file_path, 'r', encoding='utf-8') as f:
-                            text += f.read()
-                    else:
-                        logging.warning(f"Formato file non supportato: {file_path}")
                 except Exception as e:
-                    logging.error(f"Errore durante l'elaborazione del file {file_path}: {str(e)}")
-                continue
-            # Gestione file direttamente caricati
-            if not hasattr(file, 'name') or not hasattr(file, 'read'):
-                logging.error(f"Oggetto file non valido: {type(file)}")
-                continue
-            # Salva il file originale in Temp_file
-            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-            file_name = f"{timestamp}_{os.path.basename(file.name)}"
-            temp_path = os.path.join(temp_dir, file_name)
-            # Salva il contenuto del file
-            with open(temp_path, 'wb') as f:
-                f.write(file.read())
-            logging.info(f"File salvato in Temp_file: {temp_path}")
-            file_path = temp_path
-            if file_path.lower().endswith('.pdf'):
-                text += extract_text_from_pdf(file_path)
-            elif file_path.lower().endswith('.docx'):
-                text += extract_text_from_docx(file_path)
-            elif file_path.lower().endswith('.txt'):
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    text += f.read()
-            else:
-                logging.warning(f"Formato file non supportato: {file_path}")
-            if text and not text.endswith('\n\n'):
-                text += '\n\n'
         except Exception as e:
-            logging.error(f"Errore durante l'elaborazione del file {file_path if file_path else 'unknown'}: {str(e)}")
-            continue
-    return text.strip()

 import logging
 import os
 import shutil
+from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx
+from datetime import datetime
 def extract_text_from_files(files):
     """Extract and concatenate the text of uploaded PDF, DOCX and TXT files.
     Each input is copied to a uniquely named temporary file, its text is
     extracted according to its extension, and the temporary copy is removed
     again. A file that cannot be processed does not abort the batch: a
     bracketed error placeholder is added to the result in its place.
     Files with any other extension are skipped with a warning.
     Args:
         files: Iterable of uploaded files. Each item is either an object
             whose ``name`` attribute is the path of the file on disk, or
             a plain path string. May be empty or None.
     Returns:
         str: Extracted texts (and error placeholders) joined by a blank
         line, in input order; "" when ``files`` is empty or None.
     """
     import tempfile  # function scope: the module does not import it at top level
     if not files:
         return ""
     # Resolve Temp_file relative to this module instead of a hard-coded
     # absolute path that only exists on one developer machine.
     temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "Temp_file")
     try:
         os.makedirs(temp_dir, exist_ok=True)
     except OSError as e:
         # Folder not creatable here: fall back to the system temp dir rather than crash
         logging.warning(f"Impossibile creare la cartella Temp_file ({temp_dir}): {str(e)}")
         temp_dir = tempfile.gettempdir()
     extracted_text = []
     for file in files:
         # Resolve the path once, up front, so the error handlers below can
         # never fail on an input that has no .name attribute.
         source_path = getattr(file, "name", None) or str(file)
         file_extension = os.path.splitext(source_path)[1].lower()
         if file_extension not in ('.pdf', '.docx', '.txt'):
             logging.warning(f"Formato file non supportato: {source_path}")
             continue
         temp_path = None
         try:
             # mkstemp guarantees a unique name; a second-resolution timestamp
             # can collide when several uploads are handled in the same second.
             handle, temp_path = tempfile.mkstemp(prefix="temp_", suffix=file_extension, dir=temp_dir)
             os.close(handle)
             shutil.copy2(source_path, temp_path)
             logging.info(f"File temporaneo creato in: {temp_path}")
             if file_extension == '.pdf':
                 try:
                     extracted_text.append(extract_text_from_pdf(temp_path))
                 except Exception as e:
                     # PDF failures carry the detail into the returned text
                     logging.error(f"Errore nell'elaborazione del PDF {source_path}: {str(e)}")
                     extracted_text.append(f"[Errore nell'elaborazione del PDF {source_path}. Dettaglio: {str(e)}]")
             elif file_extension == '.docx':
                 extracted_text.append(extract_text_from_docx(temp_path))
             else:
                 with open(temp_path, 'r', encoding='utf-8') as f:
                     extracted_text.append(f.read())
         except Exception as e:
             logging.error(f"Errore durante l'elaborazione del file {source_path}: {str(e)}")
             extracted_text.append(f"[Errore nell'elaborazione del file {source_path}]")
         finally:
             # Always remove the temporary copy, even after a failure
             if temp_path and os.path.exists(temp_path):
                 try:
                     os.remove(temp_path)
                 except OSError as e:
                     logging.error(f"Errore nella pulizia del file temporaneo: {str(e)}")
     return "\n\n".join(extracted_text)

db/.DS_Store CHANGED Viewed

Binary files a/db/.DS_Store and b/db/.DS_Store differ