Spaces:

huggingchat
/

document-parser

Running on Zero

document-parser / app.py

Liam Dyer

fix: pandoc

7ca6619 unverified about 1 year ago

2.62 kB

	import gradio as gr
	import spaces
	import subprocess
	import os
	import string
	import random
	from pypdf import PdfReader
	import ocrmypdf


	def random_word(length):
	    """Return a random string made of ``length`` lowercase ASCII letters."""
	    alphabet = string.ascii_lowercase
	    picked = []
	    # One independent draw per character, in order.
	    for _ in range(length):
	        picked.append(random.choice(alphabet))
	    return "".join(picked)


	def convert_pdf(input_file):
	    """Extract text and metadata from a PDF, falling back to OCR for scans.

	    The embedded text layer is read first. When the document contains
	    images but fewer than 1000 characters of text, OCR is run into a
	    temporary sibling file and the text is re-extracted from that OCR
	    output. The temporary file is always removed afterwards.

	    Args:
	        input_file: Path of the PDF file, as a string.

	    Returns:
	        A ``(text, metadata)`` tuple: the extracted text and the dict
	        produced by ``extract_metadata_from_pdf``.
	    """
	    reader = PdfReader(input_file)
	    metadata = extract_metadata_from_pdf(reader)
	    text = extract_text_from_pdf(reader)

	    # Count the images embedded across all pages.
	    image_count = sum(len(page.images) for page in reader.pages)

	    # If there are images and not much content, perform OCR on the document
	    if image_count > 0 and len(text) < 1000:
	        # Derive the OCR path from the stem only: str.replace would rewrite
	        # every ".pdf" occurrence in the path, and would return the input
	        # path itself when the name has no ".pdf" in it.
	        out_pdf_file = os.path.splitext(input_file)[0] + "_ocr.pdf"
	        try:
	            ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)

	            # Re-extract text from the OCR result, not from the original.
	            text = extract_text_from_pdf(PdfReader(out_pdf_file))
	        finally:
	            # Delete the OCR file even when OCR or extraction failed; it
	            # may not exist if OCR aborted before writing anything.
	            if os.path.exists(out_pdf_file):
	                os.remove(out_pdf_file)

	    return text, metadata


	def extract_text_from_pdf(reader):
	    """Concatenate the text of every page that has any, with page headers.

	    Each non-empty page contributes a ``---- Page N ----`` header line,
	    the page text and a blank line. ``N`` is the zero-based page index, so
	    the first page is labelled ``Page 0``. Pages whose extracted text is
	    empty are skipped.

	    Args:
	        reader: Object exposing ``pages``, an iterable of page objects
	            that each provide ``extract_text()``.

	    Returns:
	        The combined text with leading and trailing whitespace stripped,
	        or an empty string when no page yields text.
	    """
	    sections = []
	    for idx, page in enumerate(reader.pages):
	        # Extract once per page and reuse the result; extraction is the
	        # expensive step.
	        text = page.extract_text()
	        if text:
	            sections.append(f"---- Page {idx} ----\n{text}\n\n")

	    return "".join(sections).strip()


	def extract_metadata_from_pdf(reader):
	    """Collect the basic document-information fields of a PDF.

	    Args:
	        reader: Object exposing a ``metadata`` attribute, which is either
	            ``None`` or an object with ``author``, ``creator``,
	            ``producer``, ``subject`` and ``title`` attributes.

	    Returns:
	        A dict with the keys ``author``, ``creator``, ``producer``,
	        ``subject`` and ``title``. Every value is ``None`` when the
	        document carries no metadata at all.
	    """
	    fields = ("author", "creator", "producer", "subject", "title")
	    info = reader.metadata

	    # A PDF without an information dictionary yields no metadata object;
	    # keep the result shape stable instead of raising AttributeError.
	    if info is None:
	        return dict.fromkeys(fields)

	    return {name: getattr(info, name) for name in fields}


	def convert_pandoc(input_file):
	    """Convert a document to markdown by running the ``pandoc`` executable.

	    Pandoc writes its output to a randomly named ``.md`` file in the
	    current working directory; that file is read back and then deleted,
	    whether or not the conversion succeeded.

	    Args:
	        input_file: Path of the document to convert.

	    Returns:
	        The markdown text produced by pandoc.

	    Raises:
	        ValueError: If pandoc exits with a non-zero status.
	    """
	    # Convert the file to markdown with pandoc
	    output_file = f"{random_word(16)}.md"
	    try:
	        result = subprocess.call(
	            ["pandoc", input_file, "-t", "markdown", "-o", output_file]
	        )
	        if result != 0:
	            raise ValueError("Error converting file to markdown with pandoc")

	        # Pandoc always emits UTF-8, so do not rely on the locale default.
	        with open(output_file, "r", encoding="utf-8") as f:
	            return f.read()
	    finally:
	        # Remove the temporary file on every path; it may be missing if
	        # pandoc failed before creating it.
	        if os.path.exists(output_file):
	            os.remove(output_file)


	@spaces.GPU
	def convert(input_file):
	    """Turn an uploaded file into text plus a metadata dict.

	    PDFs go through ``convert_pdf``. Files ending in ``.txt``, ``.csv``,
	    ``.tsv`` or ``.md`` are returned verbatim, since pandoc would add
	    nothing. Everything else is converted with ``convert_pandoc``.

	    Args:
	        input_file: Path of the uploaded file, as a string.

	    Returns:
	        A ``(text, metadata)`` tuple; ``metadata`` is an empty dict for
	        every input that is not a PDF.
	    """
	    if input_file.endswith(".pdf"):
	        return convert_pdf(input_file)

	    passthrough_suffixes = (".txt", ".csv", ".tsv", ".md")
	    if not input_file.endswith(passthrough_suffixes):
	        return convert_pandoc(input_file), {}

	    # Already plain text: hand the content back untouched.
	    with open(input_file, "r") as handle:
	        contents = handle.read()
	    return contents, {}


	# Build the upload form: one file input, with the converted text and the
	# metadata dict as outputs, then start serving it.
	demo = gr.Interface(
	    fn=convert,
	    inputs=gr.File(label="Upload File", type="filepath"),
	    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
	)
	demo.launch()