Edurag_beta / app /utils /extract_utils.py
Nugh75's picture
update struttura
3c5ed5b
raw
history blame
774 Bytes
import PyPDF2
from docx import Document
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Pages are visited in the order given by ``reader.pages`` and their text
    is concatenated as-is; no separator is inserted between pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: Text extracted from the PDF (empty string if no page yields
        any text).
    """
    with open(file_path, 'rb') as f:
        # The reader is built on the open handle, so extraction is done
        # inside the `with` block, before the file is closed.
        reader = PyPDF2.PdfReader(f)
        # `or ""` keeps the join from failing should extract_text() hand
        # back None instead of a string for some page.
        return "".join(page.extract_text() or "" for page in reader.pages)
def extract_text_from_docx(file_path):
    """Extract the text of a DOCX file.

    Only the entries of ``doc.paragraphs`` are read. Each paragraph
    contributes its text followed by a newline, so a non-empty result
    always ends with ``"\\n"``.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        str: Text extracted from the Word document.
    """
    doc = Document(file_path)
    # Single join instead of repeated `+=`; output is identical, including
    # the newline after the last paragraph.
    return "".join(para.text + "\n" for para in doc.paragraphs)