# Spaces: Runtime error
from pathlib import Path | |
import pypdf | |
import docx2txt | |
class DocumentReader:
    """Load PDF, DOCX and TXT files into a list of text/metadata records.

    Every reader returns a list of dicts shaped like
    ``{"text": <str>, "metadata": {"file_name": <str>, ...}}``.
    The readers are static methods, so they work both as
    ``DocumentReader.read_txt(path)`` and on an instance.
    """

    @staticmethod
    def read_pdf(data_path):
        """Return one record per page of the PDF at *data_path*.

        Each record's metadata holds the page's ``page_label`` (as reported
        by pypdf) and the ``file_name``.
        """
        data_path = Path(data_path)  # accept plain strings as well as Path
        with open(data_path, "rb") as fp:
            pdf = pypdf.PdfReader(fp)
            # Read the label list once up front instead of re-reading the
            # ``page_labels`` property for every page of a large document.
            page_labels = pdf.page_labels
            file_name = data_path.name
            # Text extraction must happen while the file is still open.
            return [
                {
                    "text": page.extract_text(),
                    "metadata": {
                        "page_label": page_labels[index],
                        "file_name": file_name,
                    },
                }
                for index, page in enumerate(pdf.pages)
            ]

    @staticmethod
    def read_docx(data_path):
        """Return a single record holding the text of the DOCX at *data_path*."""
        data_path = Path(data_path)
        text = docx2txt.process(data_path)
        return [{"text": text, "metadata": {"file_name": data_path.name}}]

    @staticmethod
    def read_txt(data_path):
        """Return a single record holding the text file's full contents.

        The file is decoded as UTF-8 so the result does not depend on the
        platform's default locale encoding.
        """
        data_path = Path(data_path)
        with open(data_path, "r", encoding="utf-8") as fp:
            text = fp.read()
        return [{"text": text, "metadata": {"file_name": data_path.name}}]

    @staticmethod
    def read_document(file_path):
        """Read *file_path* with the reader matching its extension.

        The extension is matched case-insensitively (``.PDF`` == ``.pdf``).

        Raises:
            ValueError: if the extension is not ``.pdf``, ``.docx`` or ``.txt``.
        """
        data_path = Path(file_path)
        readers = {
            ".pdf": DocumentReader.read_pdf,
            ".docx": DocumentReader.read_docx,
            ".txt": DocumentReader.read_txt,
        }
        reader = readers.get(data_path.suffix.lower())
        if reader is None:
            raise ValueError("Unsupported file format")
        return reader(data_path)
if __name__ == "__main__":
    # Demo run: parse one sample PDF and dump the resulting records
    # (text plus metadata) to stdout.
    DATA_PATH = "71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf"
    documents = DocumentReader.read_document(DATA_PATH)
    print(documents)