Spaces:

pippobertin
/

Ordina_Documenti

Sleeping

App Files Files Community

pippobertin committed on Jan 15, 2024

Commit

2053f0e

verified ·

1 Parent(s): ecfa7fd

Update spacy2.py

Browse files

Files changed (1) hide show

spacy2.py +0 -22

spacy2.py CHANGED Viewed

@@ -2,13 +2,6 @@ import spacy
 import shutil
 import os
 import pdfplumber
-import pytesseract
-from PIL import Image
-from pdf2image import convert_from_path
-os.environ["PATH"] += os.pathsep + "https://huggingface.co/spaces/pippobertin/Ordina_Documenti/tree/main/poppler-24.01.0"
-# Configura il percorso di tesseract
-pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
 # Carica il modello linguistico italiano di spaCy
 nlp = spacy.load('it_core_news_sm')
@@ -42,17 +35,6 @@ def estrai_testo_da_pdf_con_pdfplumber(file_path):
         print(f"Errore nell'apertura o elaborazione del file PDF: {e}")
     return text.lower()
-def estrai_testo_da_pdf_con_ocr(file_path):
-    text = ''
-    pages = convert_from_path(file_path)
-    for page in pages:
-        text += pytesseract.image_to_string(page, lang='ita')
-    return text.lower()
-def estrai_testo_da_immagine(file_path):
-    image = Image.open(file_path)
-    return pytesseract.image_to_string(image, lang='ita').lower()
 def assegna_categoria_con_spacy(testo):
     doc = nlp(testo)
     for token in doc:
@@ -68,10 +50,6 @@ def classifica_e_sposta_documenti(cartella_origine, file_name):
     # Determina il tipo di file e estrai il testo appropriatamente
     if file_path.endswith('.pdf'):
         testo = estrai_testo_da_pdf_con_pdfplumber(file_path)
-        if not testo:
-            testo = estrai_testo_da_pdf_con_ocr(file_path)
-    elif file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
-        testo = estrai_testo_da_immagine(file_path)
     else:
         print(f"Il formato del file {file_path} non è supportato.")
         return

 import shutil
 import os
 import pdfplumber
 # Carica il modello linguistico italiano di spaCy
 nlp = spacy.load('it_core_news_sm')
         print(f"Errore nell'apertura o elaborazione del file PDF: {e}")
     return text.lower()
 def assegna_categoria_con_spacy(testo):
     doc = nlp(testo)
     for token in doc:
     # Determina il tipo di file e estrai il testo appropriatamente
     if file_path.endswith('.pdf'):
         testo = estrai_testo_da_pdf_con_pdfplumber(file_path)
     else:
         print(f"Il formato del file {file_path} non è supportato.")
         return