# Spaces: Running
from huggingface_hub import hf_hub_download | |
import re | |
from PIL import Image | |
import gradio as gr | |
from transformers import NougatProcessor, VisionEncoderDecoderModel | |
from datasets import load_dataset | |
import torch | |
# Checkpoint name shared by the processor and the model.
model_checkpoint = "facebook/nougat-base"

# Default to the CPU and switch to CUDA only when torch reports it available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the processor and the model weights from the same checkpoint, then
# move the model to the selected device.
processor = NougatProcessor.from_pretrained(model_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint)
model.to(device)
# Run the model on a page image and return the decoded text
def predict(img, max_new_tokens=30):
    """Generate text for a page image with the loaded model.

    Args:
        img: Page image in a form accepted by the module-level ``processor``
            (here it is supplied by the Gradio image input).
        max_new_tokens: Upper bound on the number of tokens generated.
            Defaults to 30. Larger values allow longer output at the cost of
            longer generation time.

    Returns:
        The first decoded sequence after
        ``processor.post_process_generation(..., fix_markdown=False)``.

    Raises:
        ValueError: If ``img`` is ``None``.
    """
    if img is None:
        # Fail early with a readable message instead of letting the processor
        # reject the missing image (e.g. a submit with no image selected).
        raise ValueError("No image provided: please upload a page image.")

    pixel_values = processor(img, return_tensors="pt").pixel_values

    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=max_new_tokens,
        # Forbid the unknown token so it never appears in the output.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    # A single image is passed in, so only the first decoded sequence is used.
    sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    sequence = processor.post_process_generation(sequence, fix_markdown=False)
    return sequence
# UI pieces: an image input, a text output, and one example file.
image = gr.Image()
text = ["text"]
examples = ['page_10.jpg']

# Wire predict() into a Gradio interface and start serving it.
intf = gr.Interface(
    fn=predict,
    inputs=image,
    outputs=text,
    examples=examples,
)
intf.launch(inline=False)