Spaces:

huggingchat
/

document-parser

Running on Zero

File size: 2,624 Bytes

dd1cb9c
1f0ed21
6c400a9
 
efce880
 
6c400a9
 
efce880
 
 
 
 
dd1cb9c
 
6c400a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efce880
 
7ca6619
 
 
 
 
efce880
 
 
 
 
 
 
dd1cb9c
 
6c400a9
 
 
 
 
 
4013f70
6c400a9
 
 
 
4013f70
6c400a9
 
dd1cb9c
1cf5a2d
 
4013f70
 
 
 
dd1cb9c

import os
import random
import string
import subprocess
import tempfile

import gradio as gr
import spaces
from pypdf import PdfReader
import ocrmypdf


def random_word(length):
    """Return a random string made of ``length`` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        # One independent draw per character
        picked.append(random.choice(alphabet))
    return "".join(picked)


def convert_pdf(input_file):
    """Extract text and metadata from a PDF, running OCR on image-only scans.

    The text layer is read first. If it holds fewer than 1000 characters and
    the document embeds at least one image, the file is assumed to be a scan:
    it is run through ``ocrmypdf`` and the text is re-read from the OCR output.

    Args:
        input_file: Path to the PDF file on disk.

    Returns:
        A ``(text, metadata)`` tuple, where ``text`` is the result of
        ``extract_text_from_pdf`` and ``metadata`` is the result of
        ``extract_metadata_from_pdf``.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Test the cheap condition first so pages are only scanned for images
    # when the text layer is actually sparse; stop at the first image found.
    needs_ocr = len(text) < 1000 and any(
        len(page.images) > 0 for page in reader.pages
    )

    if needs_ocr:
        # Build the OCR output path from the extension only; str.replace would
        # rewrite every ".pdf" occurring anywhere in the path.
        base, ext = os.path.splitext(input_file)
        out_pdf_file = f"{base}_ocr{ext}"
        try:
            ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)

            # Re-extract from the OCR output: the input file still has no
            # usable text layer.
            text = extract_text_from_pdf(PdfReader(out_pdf_file))
        finally:
            # Remove the OCR file even when OCR or re-extraction fails.
            if os.path.exists(out_pdf_file):
                os.remove(out_pdf_file)

    return text, metadata


def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF.

    Each page that yields text is emitted as a ``---- Page N ----`` header
    line followed by the page text and a blank line. ``N`` is the zero-based
    position of the page in ``reader.pages``. Pages without text are skipped.

    Args:
        reader: An object exposing ``pages``, an iterable of page objects that
            each provide ``extract_text()``.

    Returns:
        The combined text with leading and trailing whitespace stripped, or an
        empty string when no page yields text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once and reuse the result rather than parsing the page twice.
        text = page.extract_text()
        # Truthiness also skips a None result instead of failing on len(None).
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")

    return "".join(sections).strip()


def extract_metadata_from_pdf(reader):
    """Return selected document-information fields of a PDF as a dict.

    Args:
        reader: An object exposing ``metadata``, which is either ``None`` or
            an object with ``author``, ``creator``, ``producer``, ``subject``
            and ``title`` attributes.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the document
        carries no metadata at all.
    """
    fields = ("author", "creator", "producer", "subject", "title")
    info = reader.metadata

    # A PDF without an information dictionary has no metadata object; report
    # empty fields instead of raising AttributeError.
    if info is None:
        return {name: None for name in fields}

    return {name: getattr(info, name) for name in fields}


def convert_pandoc(input_file):
    """Convert a document to Markdown by invoking the ``pandoc`` executable.

    Args:
        input_file: Path to the document to convert.

    Returns:
        The Markdown text produced by pandoc.

    Raises:
        ValueError: If pandoc exits with a non-zero status.
    """
    # Write into a private temporary directory: this avoids name collisions in
    # the working directory and removes the output on every exit path.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_file = os.path.join(tmp_dir, "output.md")
        result = subprocess.call(
            ["pandoc", input_file, "-t", "markdown", "-o", output_file]
        )
        if result != 0:
            raise ValueError("Error converting file to markdown with pandoc")

        # pandoc writes UTF-8, so decode explicitly rather than relying on the
        # locale's default encoding.
        with open(output_file, "r", encoding="utf-8") as f:
            return f.read()


@spaces.GPU
def convert(input_file):
    """Convert an uploaded file to text, choosing a strategy by its extension.

    Plain-text formats (``.txt``, ``.csv``, ``.tsv``, ``.md``) are returned
    as-is, PDFs go through ``convert_pdf``, and everything else is handed to
    ``convert_pandoc``. Extensions are matched case-insensitively.

    Args:
        input_file: Path to the uploaded file.

    Returns:
        A ``(text, metadata)`` tuple. ``metadata`` is an empty dict for every
        format except PDF.
    """
    plain_text_filetypes = (".txt", ".csv", ".tsv", ".md")
    # Compare against a lower-cased copy so "REPORT.PDF" or "notes.TXT" take
    # the same route as their lower-case equivalents.
    lowered = input_file.lower()

    # Already a plain text file that wouldn't benefit from pandoc so return the content
    if lowered.endswith(plain_text_filetypes):
        # Decode as UTF-8 and substitute undecodable bytes so an upload in
        # another encoding still yields readable text instead of an error.
        with open(input_file, "r", encoding="utf-8", errors="replace") as f:
            return f.read(), {}

    if lowered.endswith(".pdf"):
        return convert_pdf(input_file)

    return convert_pandoc(input_file), {}


# Build the web UI: a single file upload in, converted text plus metadata out.
upload_input = gr.File(label="Upload File", type="filepath")
markdown_output = gr.Text(label="Markdown")
metadata_output = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=upload_input,
    outputs=[markdown_output, metadata_output],
)

# Start serving the interface.
demo.launch()