Spaces:

Nugh75
/

Edurag_beta

Sleeping

Edurag_beta / app /utils /extract_utils.py

update struttura

3c5ed5b 6 months ago

774 Bytes

	import PyPDF2
	from docx import Document

	def extract_text_from_pdf(file_path):
	    """
	    Extract the text from a PDF file.

	    Args:
	        file_path: Path to the PDF file.

	    Returns:
	        str: Text of every page, concatenated in page order with no
	        separator inserted between pages.
	    """
	    with open(file_path, 'rb') as f:
	        reader = PyPDF2.PdfReader(f)
	        # Extraction happens while the file is still open: the reader
	        # pulls page data from the underlying stream.
	        # `or ""` guards against a page yielding a falsy result (e.g. None
	        # from some extractor versions - not verifiable from this file),
	        # which would otherwise break string concatenation.
	        return "".join(page.extract_text() or "" for page in reader.pages)

	def extract_text_from_docx(file_path):
	    """
	    Extract the text from a DOCX file.

	    Args:
	        file_path: Path to the DOCX file.

	    Returns:
	        str: Text of the document's paragraphs, each followed by a
	        newline (so a non-empty result always ends with a newline).
	    """
	    doc = Document(file_path)
	    # Single join instead of repeated `+=`; every paragraph, including the
	    # last one, keeps its trailing "\n" exactly as before.
	    return "".join(para.text + "\n" for para in doc.paragraphs)