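"""Gradio demo: ask a question about an uploaded image with DeepSeek-VL 1.3B (chat) on CPU."""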
import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images
from io import BytesIO
from PIL import Image
# Load the chat processor and tokenizer
model_path = "deepseek-ai/deepseek-vl-1.3b-chat"
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer
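
# Note: describe_image below reloads the model checkpoint on every request. This keeps
# idle memory low but makes each call slow on CPU; loading the model once at module
# level is a possible alternative if memory allows.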

# Define the function for image description (CPU-only)
def describe_image(image, user_question="Solve the problem in the image"):
    try:
        # Convert the PIL Image to a BytesIO object for compatibility
        image_byte_arr = BytesIO()
        image.save(image_byte_arr, format="PNG")  # Save image in PNG format
        image_byte_arr.seek(0)  # Move pointer to the start
        # Define the conversation, using the user's question
        conversation = [
            {
                "role": "User",
                "content": f"<image_placeholder>{user_question}",
                "images": [image_byte_arr]  # Pass the image byte array instead of an object
            },
            {
                "role": "Assistant",
                "content": ""
            }
        ]

        # Convert image byte array back to a PIL image for processing
        pil_images = [Image.open(BytesIO(image_byte_arr.read()))]
        image_byte_arr.seek(0)  # Reset the byte stream again for reuse

        # Load images and prepare the inputs
        prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to('cpu')  # Keep inputs on CPU

        # Load the model (on every call) and keep it on CPU in bfloat16
        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True
        ).to(torch.bfloat16).cpu().eval()

        # Build input embeddings that merge the image features with the text tokens
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # Generate the model's response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True
        )

        # Decode the generated tokens into text
        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer

    except Exception as e:
        # Provide detailed error information
        return f"Error: {str(e)}"

# Gradio interface
def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown("# Image Description with DeepSeek VL 1.3b 🐬\n### Upload an image and ask a question about it.")
        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(
                label="Question (optional)",
                placeholder="Ask a question about the image (e.g., 'What is happening in this image?')",
                lines=2
            )
        output_text = gr.Textbox(label="Image Description", interactive=False)
        submit_btn = gr.Button("Generate Description")
        submit_btn.click(
            fn=describe_image,
            inputs=[image_input, question_input],  # Pass both image and question as inputs
            outputs=output_text
        )
    demo.launch()

# Launch the Gradio app
if __name__ == "__main__":
    gradio_app()