Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,624 Bytes
dd1cb9c 1f0ed21 6c400a9 efce880 6c400a9 efce880 dd1cb9c 6c400a9 efce880 7ca6619 efce880 dd1cb9c 6c400a9 4013f70 6c400a9 4013f70 6c400a9 dd1cb9c 1cf5a2d 4013f70 dd1cb9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
import random
import string
import subprocess
import tempfile

import gradio as gr
import ocrmypdf
import spaces
from pypdf import PdfReader
def random_word(length):
    """Return a random string of `length` lowercase ASCII letters.

    Each character is drawn independently with `random.choice`, so the
    result is reproducible under `random.seed` but is not suitable for
    anything security-sensitive.
    """
    alphabet = string.ascii_lowercase
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)
def convert_pdf(input_file):
    """Extract text and metadata from a PDF, running OCR when it looks scanned.

    The PDF is first read directly. If the extracted text is short (fewer
    than 1000 characters) and at least one page contains an image, the
    document is run through `ocrmypdf` and the text is re-extracted from the
    OCR output.

    Args:
        input_file: Path to the PDF file on disk.

    Returns:
        A `(text, metadata)` tuple: the extracted text and the dict produced
        by `extract_metadata_from_pdf`.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Check the cheap condition (text length) first so page images are only
    # inspected when OCR is actually a candidate.
    needs_ocr = len(text) < 1000 and any(
        len(page.images) > 0 for page in reader.pages
    )
    if not needs_ocr:
        return text, metadata

    # Build the OCR output path from the extension only; str.replace would
    # also rewrite any ".pdf" appearing earlier in the path.
    stem, ext = os.path.splitext(input_file)
    out_pdf_file = f"{stem}_ocr{ext}"
    try:
        ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)
        # Re-extract from the OCR output; re-reading input_file would just
        # return the same pre-OCR text again.
        text = extract_text_from_pdf(PdfReader(out_pdf_file))
    finally:
        # Always clean up the OCR file, even if OCR or extraction failed.
        try:
            os.remove(out_pdf_file)
        except FileNotFoundError:
            pass

    return text, metadata
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF reader.

    Each page that yields text is emitted as a ``---- Page N ----`` header
    (N is the zero-based page index) followed by the page text and a blank
    line. Pages with no extractable text are skipped.

    Args:
        reader: An object exposing ``pages``, each with ``extract_text()``
            (e.g. a ``pypdf.PdfReader``).

    Returns:
        The combined text with leading/trailing whitespace stripped; an
        empty string when no page yields text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page: text extraction is the expensive step.
        text = page.extract_text()
        if not text:
            continue
        sections.append(f"---- Page {idx} ----\n{text}\n\n")
    return "".join(sections).strip()
def extract_metadata_from_pdf(reader):
    """Return the standard document-info fields of a PDF as a dict.

    Args:
        reader: An object exposing a ``metadata`` attribute (e.g. a
            ``pypdf.PdfReader``). ``metadata`` may be ``None`` when the PDF
            carries no document information.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the reader
        has no metadata.
    """
    field_names = ("author", "creator", "producer", "subject", "title")
    info = reader.metadata
    if info is None:
        # No info dictionary in the PDF: report every field as missing
        # instead of failing with AttributeError.
        return dict.fromkeys(field_names)
    return {name: getattr(info, name) for name in field_names}
def convert_pandoc(input_file):
    """Convert a document to Markdown by shelling out to ``pandoc``.

    The output is written into a private temporary directory that is
    removed when the function exits, whether conversion succeeds or fails.

    Args:
        input_file: Path to the document to convert.

    Returns:
        The Markdown text produced by pandoc.

    Raises:
        ValueError: If pandoc exits with a non-zero status.
    """
    # A temporary directory avoids name collisions in the working directory
    # and guarantees cleanup on every exit path.
    with tempfile.TemporaryDirectory() as work_dir:
        output_file = os.path.join(work_dir, "output.md")
        result = subprocess.call(
            ["pandoc", input_file, "-t", "markdown", "-o", output_file]
        )
        if result != 0:
            raise ValueError("Error converting file to markdown with pandoc")

        # pandoc always writes UTF-8, so do not depend on the locale default.
        with open(output_file, "r", encoding="utf-8") as f:
            return f.read()
@spaces.GPU
def convert(input_file):
    """Convert an uploaded file to text, choosing a strategy by extension.

    * ``.txt``, ``.csv``, ``.tsv``, ``.md``: returned verbatim, since pandoc
      would add nothing.
    * ``.pdf``: handled by ``convert_pdf`` (text plus metadata).
    * anything else: handled by ``convert_pandoc``.

    Args:
        input_file: Path to the uploaded file.

    Returns:
        A ``(text, metadata)`` tuple; ``metadata`` is an empty dict for
        everything except PDFs.
    """
    plain_text_suffixes = (".txt", ".csv", ".tsv", ".md")

    # Plain text needs no conversion: hand the content back unchanged.
    if input_file.endswith(plain_text_suffixes):
        with open(input_file, "r") as handle:
            content = handle.read()
        return content, {}

    if not input_file.endswith(".pdf"):
        return convert_pandoc(input_file), {}

    return convert_pdf(input_file)
# Build the Gradio UI: one file upload in, converted text plus metadata out.
demo = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload File", type="filepath"),
    outputs=[
        gr.Text(label="Markdown"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()
|