|
import os |
|
import re |
|
import subprocess |
|
import numpy as np |
|
from PIL import Image |
|
import gradio as gr |
|
import torch |
|
from transformers import AutoProcessor, AutoModelForCausalLM |
|
|
|
|
|
|
|
# Hugging Face Hub id of the captioning checkpoint; also used to build TITLE.
model_name = "PJMixers-Images/Florence-2-base-Castollux-v0.5"

# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally -- keep it only while that repository is trusted.
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model = model.eval()  # inference only: switch the module to eval mode
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Markdown heading shown at the top of the demo, linking to the model page.
TITLE = f"# [{model_name}](https://huggingface.co/{model_name})"
|
|
|
|
|
def process_image(image, num_beams=5, min_p=0.0, top_p=1.0):
    """
    Generate a caption for a single image.

    Args:
        image: A file path (str), a numpy array, or a PIL Image. ``None``
            (no image supplied) is reported as an error string.
        num_beams: Beam count forwarded to ``model.generate``; coerced to int.
        min_p: Min-p value forwarded to ``model.generate``.
        top_p: Top-p value forwarded to ``model.generate``.

    Returns:
        The decoded caption with ``<s>``, ``</s>`` and ``<pad>`` removed and
        surrounding whitespace stripped. On any failure, a string starting
        with ``"Error processing image: "`` is returned instead of raising.
    """
    # Fail with a clear message instead of an AttributeError on ``None.mode``.
    if image is None:
        return "Error processing image: no image provided"

    try:
        # Normalise every supported input type to an RGB PIL image.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, str):
            # convert() yields a fully loaded copy, so the underlying file
            # can be closed immediately rather than left open.
            with Image.open(image) as opened:
                image = opened.convert("RGB")
        if image.mode != "RGB":
            image = image.convert("RGB")

        # "<CAPTION>" is the task prompt sent alongside the pixels.
        inputs = processor(text="<CAPTION>", images=image, return_tensors="pt")

        # Move every tensor to the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                # Defensive: a beam count must be an integer (e.g. 5.0 -> 5).
                num_beams=int(num_beams),
                do_sample=True,
                top_p=top_p,
                min_p=min_p,
            )

        # Special tokens are kept by the decoder and the three sequence
        # markers are stripped by hand, in the same order as before.
        caption = processor.batch_decode(
            generated_ids,
            skip_special_tokens=False,
        )[0]
        for marker in ("</s>", "<s>", "<pad>"):
            caption = caption.replace(marker, "")
        return caption.strip()

    except Exception as e:
        # UI boundary: surface the failure as text in the output box.
        return f"Error processing image: {e}"
|
|
|
|
|
|
|
# Custom stylesheet handed to gr.Blocks. The rule applies to the component
# whose elem_id is "output" (the caption textbox below).
css = """
#output { height: 500px; overflow: auto; border: 1px solid #ccc; }
"""

# NOTE(review): the nesting below is reconstructed from the statement grouping
# of the original script; confirm the layout renders as intended.
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)

    with gr.Tab(label="Single Image Processing"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")

            with gr.Column():
                # elem_id ties this textbox to the "#output" CSS rule, which
                # previously matched no component at all.
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        submit_btn = gr.Button(value="Submit")

        # Generation settings forwarded to process_image.
        num_beams_slider = gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=5,
            label="Number of Beams",
        )
        min_p_slider = gr.Slider(
            minimum=0,
            maximum=1,
            step=0.01,
            value=0.0,
            label="Min-P",
        )
        top_p_slider = gr.Slider(
            minimum=0,
            maximum=1,
            step=0.01,
            value=1.0,
            label="Top-P",
        )

        # Same component order as process_image's positional parameters;
        # shared by the examples widget and the submit button.
        caption_inputs = [
            input_img,
            num_beams_slider,
            min_p_slider,
            top_p_slider,
        ]

        # Bundled sample images; each row uses the default slider values.
        example_images = [
            "eval_img_1.jpg",
            "eval_img_2.jpg",
            "eval_img_3.jpg",
            "eval_img_4.jpg",
            "eval_img_5.jpg",
            "eval_img_6.jpg",
            "eval_img_7.png",
            "eval_img_8.jpg",
        ]

        gr.Examples(
            [[path, 5, 0.0, 1.0] for path in example_images],
            inputs=caption_inputs,
            outputs=[output_text],
            fn=process_image,
            label="Try captioning on below examples",
        )

        submit_btn.click(
            process_image,
            caption_inputs,
            [output_text],
        )

if __name__ == "__main__":
    demo.launch(debug=True)
|
|