Spaces:

sunil448832
/

retrieval-augment-generation

Runtime error

App Files Files Community

retrieval-augment-generation / data_processor /document_reader.py

sunil448832

Initial Commit

eccde2c over 1 year ago

raw

history blame

2.06 kB

	from pathlib import Path
	import pypdf
	import docx2txt

	class DocumentReader:
	    """Load PDF, DOCX and TXT files into a uniform list of document dicts.

	    Every reader returns a list of ``{"text": ..., "metadata": {...}}``
	    entries. ``metadata`` always carries ``"file_name"``; PDF pages also
	    carry ``"page_label"``.
	    """

	    @staticmethod
	    def read_pdf(data_path):
	        """Return one document dict per page of a PDF file.

	        Args:
	            data_path: Location of the PDF (``str`` or ``pathlib.Path``).

	        Returns:
	            A list with one dict per page, in page order. Each dict holds
	            the extracted page text plus the page label and file name.
	        """
	        data_path = Path(data_path)
	        with open(data_path, "rb") as fp:
	            pdf = pypdf.PdfReader(fp)
	            # Fetch the labels once: indexing ``pdf.page_labels`` inside the
	            # loop would re-evaluate the property for every single page.
	            page_labels = pdf.page_labels
	            # Text extraction must happen while the file is still open.
	            return [
	                {
	                    "text": page.extract_text(),
	                    "metadata": {
	                        "page_label": label,
	                        "file_name": data_path.name,
	                    },
	                }
	                for page, label in zip(pdf.pages, page_labels)
	            ]

	    @staticmethod
	    def read_docx(data_path):
	        """Return the text of a DOCX file as a single-element document list.

	        Args:
	            data_path: Location of the DOCX (``str`` or ``pathlib.Path``).

	        Returns:
	            A one-item list whose dict holds the extracted text and the
	            file name.
	        """
	        data_path = Path(data_path)
	        text = docx2txt.process(data_path)
	        return [{"text": text, "metadata": {"file_name": data_path.name}}]

	    @staticmethod
	    def read_txt(data_path):
	        """Return the content of a text file as a single-element document list.

	        The file is decoded as UTF-8 so the result does not depend on the
	        platform's default locale encoding.

	        Args:
	            data_path: Location of the text file (``str`` or ``pathlib.Path``).

	        Returns:
	            A one-item list whose dict holds the file content and the
	            file name.
	        """
	        data_path = Path(data_path)
	        with open(data_path, "r", encoding="utf-8") as fp:
	            text = fp.read()
	        return [{"text": text, "metadata": {"file_name": data_path.name}}]

	    @staticmethod
	    def read_document(file_path):
	        """Read a document, choosing the reader from the file extension.

	        The extension is matched case-insensitively, so ``report.PDF`` is
	        handled the same way as ``report.pdf``.

	        Args:
	            file_path: Location of the document (``str`` or ``pathlib.Path``).

	        Returns:
	            The list of document dicts produced by the matching reader.

	        Raises:
	            ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
	        """
	        data_path = Path(file_path)
	        suffix = data_path.suffix.lower()
	        if suffix == ".pdf":
	            return DocumentReader.read_pdf(data_path)
	        if suffix == ".docx":
	            return DocumentReader.read_docx(data_path)
	        if suffix == ".txt":
	            return DocumentReader.read_txt(data_path)
	        raise ValueError(f"Unsupported file format: {data_path.suffix!r}")

	if __name__ == '__main__':
	    # Demo run: parse the sample PDF shipped alongside this script and
	    # dump every extracted page (text plus metadata) to stdout.
	    sample_file = '71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf'
	    print(DocumentReader.read_document(sample_file))