Spaces:
Sleeping
Sleeping
uploadfile
Browse files- app/utils/extract_utils.py +25 -9
- app/utils/helpers.py +49 -86
- db/.DS_Store +0 -0
app/utils/extract_utils.py
CHANGED
@@ -1,22 +1,38 @@
|
|
1 |
-
import
|
2 |
from docx import Document
|
|
|
|
|
3 |
|
4 |
def extract_text_from_pdf(file_path):
|
5 |
"""
|
6 |
-
Estrae il testo da un file PDF.
|
7 |
|
8 |
Args:
|
9 |
file_path: Percorso del file PDF
|
10 |
|
11 |
Returns:
|
12 |
-
str: Testo estratto dal PDF
|
13 |
"""
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
def extract_text_from_docx(file_path):
|
22 |
"""
|
|
|
1 |
+
import pdfplumber
|
2 |
from docx import Document
|
3 |
+
import logging
|
4 |
+
import io
|
5 |
|
6 |
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF file using pdfplumber.

    Pages are processed one at a time: a page whose extraction raises is
    logged as a warning and skipped, so a single bad page does not discard
    the rest of the document.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The text of every successfully processed page, joined by
        newlines.

    Raises:
        ValueError: If the joined text is empty or whitespace only (every
            page was empty or failed).
        Exception: Any error raised while opening or reading the PDF is
            logged and then re-raised unchanged; this function never returns
            an error message in place of the text.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            text = []
            for page in pdf.pages:
                try:
                    # A falsy result (e.g. None) is normalised to an empty
                    # string so the join below never sees a non-string.
                    text.append(page.extract_text() or "")
                except Exception as e:
                    # Best effort: skip the broken page and keep going.
                    logging.warning(f"Errore nell'estrazione della pagina: {e}")

        extracted_text = "\n".join(text)
        if not extracted_text.strip():
            raise ValueError("Nessun testo estratto dal PDF")

        return extracted_text

    except Exception as e:
        # logging.exception keeps the traceback, which logging.error dropped;
        # the caller still receives the original exception.
        logging.exception(f"Errore nella lettura del PDF: {e}")
        raise
|
36 |
|
37 |
def extract_text_from_docx(file_path):
|
38 |
"""
|
app/utils/helpers.py
CHANGED
@@ -1,101 +1,64 @@
|
|
1 |
import logging
|
2 |
-
from app.document_handling import extract_text_from_pdf, extract_text_from_docx
|
3 |
-
import tempfile
|
4 |
import os
|
5 |
-
from datetime import datetime
|
6 |
import shutil
|
|
|
|
|
7 |
|
8 |
def extract_text_from_files(files):
|
9 |
-
"""
|
10 |
-
Estrae e concatena il testo da file PDF, DOCX e TXT.
|
11 |
-
|
12 |
-
Args:
|
13 |
-
files (list): Lista di file caricati.
|
14 |
-
|
15 |
-
Returns:
|
16 |
-
str: Testo concatenato estratto dai file.
|
17 |
-
"""
|
18 |
if not files:
|
19 |
-
logging.warning("Nessun file fornito")
|
20 |
return ""
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
#
|
25 |
-
temp_dir =
|
26 |
-
os.
|
27 |
-
|
28 |
-
|
29 |
-
text = ""
|
30 |
for file in files:
|
|
|
31 |
try:
|
32 |
-
|
33 |
-
|
34 |
-
#
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
else:
|
47 |
-
raise ValueError("Impossibile convertire l'oggetto in bytes")
|
48 |
-
file_path = temp_path
|
49 |
-
|
50 |
-
logging.info(f"File temporaneo creato: {file_path}")
|
51 |
-
|
52 |
try:
|
53 |
-
|
54 |
-
|
55 |
-
elif file_path.lower().endswith('.docx'):
|
56 |
-
text += extract_text_from_docx(file_path)
|
57 |
-
elif file_path.lower().endswith('.txt'):
|
58 |
-
with open(file_path, 'r', encoding='utf-8') as f:
|
59 |
-
text += f.read()
|
60 |
-
else:
|
61 |
-
logging.warning(f"Formato file non supportato: {file_path}")
|
62 |
except Exception as e:
|
63 |
-
logging.error(f"Errore
|
64 |
-
|
65 |
-
continue
|
66 |
-
|
67 |
-
# Gestione file direttamente caricati
|
68 |
-
if not hasattr(file, 'name') or not hasattr(file, 'read'):
|
69 |
-
logging.error(f"Oggetto file non valido: {type(file)}")
|
70 |
-
continue
|
71 |
-
|
72 |
-
# Salva il file originale in Temp_file
|
73 |
-
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
74 |
-
file_name = f"{timestamp}_{os.path.basename(file.name)}"
|
75 |
-
temp_path = os.path.join(temp_dir, file_name)
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
elif file_path.lower().endswith('.docx'):
|
87 |
-
text += extract_text_from_docx(file_path)
|
88 |
-
elif file_path.lower().endswith('.txt'):
|
89 |
-
with open(file_path, 'r', encoding='utf-8') as f:
|
90 |
-
text += f.read()
|
91 |
-
else:
|
92 |
-
logging.warning(f"Formato file non supportato: {file_path}")
|
93 |
-
|
94 |
-
if text and not text.endswith('\n\n'):
|
95 |
-
text += '\n\n'
|
96 |
-
|
97 |
except Exception as e:
|
98 |
-
logging.error(f"Errore durante l'elaborazione del file {
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
-
return
|
|
|
1 |
import logging
|
|
|
|
|
2 |
import os
|
|
|
3 |
import shutil
|
4 |
+
from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx
|
5 |
+
from datetime import datetime
|
6 |
|
7 |
def extract_text_from_files(files):
    """Extract and concatenate the text of uploaded PDF, DOCX and TXT files.

    Each supported upload is copied into a temporary directory created for
    this call, parsed from that copy, and the copy is removed afterwards.
    Using a per-call directory (instead of a fixed absolute path and a
    timestamp-based file name) makes the function portable across machines
    and prevents concurrent calls from overwriting or deleting each other's
    working files.

    Args:
        files: Iterable of uploads. Each item is either an object exposing
            the on-disk path of the upload as ``.name`` or a plain path
            (``str`` / ``os.PathLike``). May be empty or ``None``.

    Returns:
        str: The per-file texts joined by a blank line. A file that fails to
        process contributes a bracketed error placeholder instead of text;
        a file with an unsupported extension contributes nothing. An empty
        string is returned when ``files`` is empty or ``None``.
    """
    # Function-scope import so the block does not depend on a file-level
    # import that may not exist.
    import tempfile

    if not files:
        return ""

    supported_extensions = ('.pdf', '.docx', '.txt')
    extracted_text = []

    with tempfile.TemporaryDirectory(prefix="extract_") as temp_dir:
        for index, file in enumerate(files):
            # Accept both upload objects (path in ``.name``) and bare paths,
            # so the error handlers below can always name the file.
            source_path = getattr(file, "name", file)
            temp_path = None
            try:
                file_extension = os.path.splitext(source_path)[1].lower()

                if file_extension not in supported_extensions:
                    # Previously these were copied and then silently ignored.
                    logging.warning(f"Formato file non supportato: {source_path}")
                    continue

                # The index keeps names unique within this call; the
                # directory itself is unique across calls.
                temp_path = os.path.join(temp_dir, f"temp_{index}{file_extension}")

                # Work on a private copy rather than on the uploader's file.
                shutil.copy2(source_path, temp_path)
                logging.info(f"File temporaneo creato in: {temp_path}")

                if file_extension == '.pdf':
                    try:
                        extracted_text.append(extract_text_from_pdf(temp_path))
                    except Exception as e:
                        # PDF failures carry the detail in the placeholder.
                        logging.error(f"Errore nell'elaborazione del PDF {source_path}: {e}")
                        extracted_text.append(
                            f"[Errore nell'elaborazione del PDF {source_path}. Dettaglio: {e}]"
                        )
                elif file_extension == '.docx':
                    extracted_text.append(extract_text_from_docx(temp_path))
                else:  # '.txt'
                    with open(temp_path, 'r', encoding='utf-8') as f:
                        extracted_text.append(f.read())

            except Exception as e:
                # One bad file must not abort the whole batch.
                logging.error(f"Errore durante l'elaborazione del file {source_path}: {e}")
                extracted_text.append(f"[Errore nell'elaborazione del file {source_path}]")

            finally:
                # Remove each copy right away so disk usage stays bounded
                # while a large batch is processed; the directory itself is
                # removed when the ``with`` block exits.
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except OSError as e:
                        logging.error(f"Errore nella pulizia del file temporaneo: {e}")

    return "\n\n".join(extracted_text)
|
db/.DS_Store
CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ
|
|