Nugh75 committed on
Commit
a5de847
·
1 Parent(s): e6b7117

uploadfile

Browse files
Files changed (3) hide show
  1. app/utils/extract_utils.py +25 -9
  2. app/utils/helpers.py +49 -86
  3. db/.DS_Store +0 -0
app/utils/extract_utils.py CHANGED
@@ -1,22 +1,38 @@
1
- import PyPDF2
2
  from docx import Document
 
 
3
 
4
  def extract_text_from_pdf(file_path):
5
  """
6
- Estrae il testo da un file PDF.
7
 
8
  Args:
9
  file_path: Percorso del file PDF
10
 
11
  Returns:
12
- str: Testo estratto dal PDF
13
  """
14
- with open(file_path, 'rb') as f:
15
- reader = PyPDF2.PdfReader(f)
16
- text = ""
17
- for page in reader.pages:
18
- text += page.extract_text()
19
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def extract_text_from_docx(file_path):
22
  """
 
1
+ import pdfplumber
2
  from docx import Document
3
+ import logging
4
+ import io
5
 
6
  def extract_text_from_pdf(file_path):
7
  """
8
+ Estrae il testo da un file PDF usando pdfplumber.
9
 
10
  Args:
11
  file_path: Percorso del file PDF
12
 
13
  Returns:
14
+ str: Testo estratto dal PDF o messaggio di errore
15
  """
16
+ try:
17
+ with pdfplumber.open(file_path) as pdf:
18
+ text = []
19
+ for page in pdf.pages:
20
+ try:
21
+ page_text = page.extract_text() or ""
22
+ text.append(page_text)
23
+ except Exception as e:
24
+ logging.warning(f"Errore nell'estrazione della pagina: {str(e)}")
25
+ continue
26
+
27
+ extracted_text = "\n".join(text)
28
+ if not extracted_text.strip():
29
+ raise ValueError("Nessun testo estratto dal PDF")
30
+
31
+ return extracted_text
32
+
33
+ except Exception as e:
34
+ logging.error(f"Errore nella lettura del PDF: {str(e)}")
35
+ raise
36
 
37
  def extract_text_from_docx(file_path):
38
  """
app/utils/helpers.py CHANGED
@@ -1,101 +1,64 @@
1
  import logging
2
- from app.document_handling import extract_text_from_pdf, extract_text_from_docx
3
- import tempfile
4
  import os
5
- from datetime import datetime
6
  import shutil
 
 
7
 
8
  def extract_text_from_files(files):
9
- """
10
- Estrae e concatena il testo da file PDF, DOCX e TXT.
11
-
12
- Args:
13
- files (list): Lista di file caricati.
14
-
15
- Returns:
16
- str: Testo concatenato estratto dai file.
17
- """
18
  if not files:
19
- logging.warning("Nessun file fornito")
20
  return ""
21
 
22
- logging.info(f"Ricevuti {len(files)} file da elaborare")
23
-
24
- # Crea la cartella Temp_file se non esiste
25
- temp_dir = os.path.join(os.path.dirname(__file__), '..', 'Temp_file')
26
- os.makedirs(temp_dir, exist_ok=True)
27
- logging.info(f"Cartella Temp_file: {temp_dir}")
28
-
29
- text = ""
30
  for file in files:
 
31
  try:
32
- file_path = None
33
-
34
- # Gestione degli oggetti NamedString di Gradio
35
- if type(file).__name__ == 'NamedString':
36
- original_name = getattr(file, 'name', 'file')
37
- _, ext = os.path.splitext(original_name)
38
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
39
- temp_path = os.path.join(temp_dir, f"temp_{timestamp}{ext if ext else '.txt'}")
40
-
41
- with open(temp_path, 'wb') as tmp_file:
42
- if hasattr(file, 'encode'):
43
- tmp_file.write(file.encode())
44
- elif hasattr(file, 'read'):
45
- tmp_file.write(file.read())
46
- else:
47
- raise ValueError("Impossibile convertire l'oggetto in bytes")
48
- file_path = temp_path
49
-
50
- logging.info(f"File temporaneo creato: {file_path}")
51
-
52
  try:
53
- if file_path.lower().endswith('.pdf'):
54
- text += extract_text_from_pdf(file_path)
55
- elif file_path.lower().endswith('.docx'):
56
- text += extract_text_from_docx(file_path)
57
- elif file_path.lower().endswith('.txt'):
58
- with open(file_path, 'r', encoding='utf-8') as f:
59
- text += f.read()
60
- else:
61
- logging.warning(f"Formato file non supportato: {file_path}")
62
  except Exception as e:
63
- logging.error(f"Errore durante l'elaborazione del file {file_path}: {str(e)}")
64
-
65
- continue
66
-
67
- # Gestione file direttamente caricati
68
- if not hasattr(file, 'name') or not hasattr(file, 'read'):
69
- logging.error(f"Oggetto file non valido: {type(file)}")
70
- continue
71
-
72
- # Salva il file originale in Temp_file
73
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
74
- file_name = f"{timestamp}_{os.path.basename(file.name)}"
75
- temp_path = os.path.join(temp_dir, file_name)
76
 
77
- # Salva il contenuto del file
78
- with open(temp_path, 'wb') as f:
79
- f.write(file.read())
80
-
81
- logging.info(f"File salvato in Temp_file: {temp_path}")
82
- file_path = temp_path
83
-
84
- if file_path.lower().endswith('.pdf'):
85
- text += extract_text_from_pdf(file_path)
86
- elif file_path.lower().endswith('.docx'):
87
- text += extract_text_from_docx(file_path)
88
- elif file_path.lower().endswith('.txt'):
89
- with open(file_path, 'r', encoding='utf-8') as f:
90
- text += f.read()
91
- else:
92
- logging.warning(f"Formato file non supportato: {file_path}")
93
-
94
- if text and not text.endswith('\n\n'):
95
- text += '\n\n'
96
-
97
  except Exception as e:
98
- logging.error(f"Errore durante l'elaborazione del file {file_path if file_path else 'unknown'}: {str(e)}")
99
- continue
 
 
 
 
 
 
 
 
100
 
101
- return text.strip()
 
1
  import logging
 
 
2
  import os
 
3
  import shutil
4
+ from app.utils.extract_utils import extract_text_from_pdf, extract_text_from_docx
5
+ from datetime import datetime
6
 
7
  def extract_text_from_files(files):
8
+ """Estrae il testo dai file caricati."""
 
 
 
 
 
 
 
 
9
  if not files:
 
10
  return ""
11
 
12
+ extracted_text = []
13
+
14
+ # Usa il percorso assoluto della cartella Temp_file
15
+ temp_dir = "/Users/danieledragoni/hugginface/Edurag_beta/app/Temp_file"
16
+ if not os.path.exists(temp_dir):
17
+ os.makedirs(temp_dir)
18
+
 
19
  for file in files:
20
+ temp_path = None
21
  try:
22
+ file_extension = os.path.splitext(file.name)[1].lower()
23
+
24
+ # Crea un nome file univoco nella cartella Temp_file
25
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
26
+ temp_filename = f"temp_{timestamp}{file_extension}"
27
+ temp_path = os.path.join(temp_dir, temp_filename)
28
+
29
+ # Copia il file da Gradio alla nostra cartella Temp_file
30
+ shutil.copy2(file.name, temp_path)
31
+
32
+ logging.info(f"File temporaneo creato in: {temp_path}")
33
+
34
+ # Estrai il testo in base al tipo di file
35
+ if file_extension == '.pdf':
 
 
 
 
 
 
36
  try:
37
+ text = extract_text_from_pdf(temp_path)
38
+ extracted_text.append(text)
 
 
 
 
 
 
 
39
  except Exception as e:
40
+ logging.error(f"Errore nell'elaborazione del PDF {file.name}: {str(e)}")
41
+ extracted_text.append(f"[Errore nell'elaborazione del PDF {file.name}. Dettaglio: {str(e)}]")
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ elif file_extension == '.docx':
44
+ text = extract_text_from_docx(temp_path)
45
+ extracted_text.append(text)
46
+
47
+ elif file_extension == '.txt':
48
+ with open(temp_path, 'r', encoding='utf-8') as f:
49
+ text = f.read()
50
+ extracted_text.append(text)
51
+
 
 
 
 
 
 
 
 
 
 
 
52
  except Exception as e:
53
+ logging.error(f"Errore durante l'elaborazione del file {file.name}: {str(e)}")
54
+ extracted_text.append(f"[Errore nell'elaborazione del file {file.name}]")
55
+
56
+ finally:
57
+ # Pulisci il file temporaneo
58
+ if temp_path and os.path.exists(temp_path):
59
+ try:
60
+ os.remove(temp_path)
61
+ except Exception as e:
62
+ logging.error(f"Errore nella pulizia del file temporaneo: {str(e)}")
63
 
64
+ return "\n\n".join(extracted_text)
db/.DS_Store CHANGED
Binary files a/db/.DS_Store and b/db/.DS_Store differ