"""Gradio demo that runs a Florence-2 checkpoint on an uploaded image."""

import gradio as gr
import torch
from PIL import Image
import requests
from transformers import AutoProcessor
from modeling_florence2 import Florence2ForConditionalGeneration
from configuration_florence2 import Florence2Config

MODEL_ID = "PleIAs/Florence-PDF"

# Initialize model and processor.
# Half precision is only used when a CUDA device is available; on CPU the
# model stays in float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = Florence2ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
).to(device)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Maps the label shown in the dropdown to the Florence-2 task token that is
# sent as the text prompt and passed to the post-processor.
# NOTE(review): the values were empty strings before, which made every task
# identical; the tokens below follow the Florence-2 model card -- confirm
# that this checkpoint uses the same task tokens.
TASK_PROMPTS = {
    "Caption": "<CAPTION>",
    "Detailed Caption": "<DETAILED_CAPTION>",
    "More Detailed Caption": "<MORE_DETAILED_CAPTION>",
    "Object Detection": "<OD>",
    "Dense Region Caption": "<DENSE_REGION_CAPTION>",
    "OCR": "<OCR>",
    "OCR with Region": "<OCR_WITH_REGION>",
    "Region Proposal": "<REGION_PROPOSAL>",
}

DEFAULT_TASK = "Caption"


def process_image(image, task):
    """Run the selected task on an image and return the result as text.

    Args:
        image: PIL image supplied by the Gradio image input, or ``None``
            when nothing was uploaded.
        task: One of the keys of ``TASK_PROMPTS``.

    Returns:
        ``str()`` of the object returned by
        ``processor.post_process_generation``.

    Raises:
        gr.Error: If no image was provided or ``task`` is not a known task.
    """
    # Fail with a readable UI message instead of an AttributeError/KeyError.
    if image is None:
        raise gr.Error("Please upload an image.")
    if task not in TASK_PROMPTS:
        raise gr.Error("Please select a task.")

    prompt = TASK_PROMPTS[task]
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        device, torch_dtype
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False,
        )

    # Special tokens are kept in the decoded text that is handed to the
    # post-processor.
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height),
    )
    return str(parsed_answer)


# Define Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil"),
        # A default value avoids submitting with no task selected.
        gr.Dropdown(list(TASK_PROMPTS.keys()), value=DEFAULT_TASK, label="Task"),
    ],
    outputs=gr.Textbox(label="Result"),
    title="Florence-2 Demo",
    description="Upload an image and select a task to process with Florence-2.",
)

# Launch the interface only when run as a script, so importing this module
# does not start a server.
if __name__ == "__main__":
    iface.launch()