File size: 3,602 Bytes
d3c4ebb
b9a67fe
 
 
 
 
 
d3c4ebb
b9a67fe
 
 
 
d3c4ebb
4131f20
e1bcd6a
b9a67fe
e1bcd6a
b9a67fe
e1bcd6a
 
b9a67fe
e1bcd6a
b9a67fe
 
 
 
e1bcd6a
b9a67fe
 
 
 
 
 
 
e1bcd6a
 
 
b9a67fe
e1bcd6a
b9a67fe
 
 
 
4131f20
b9a67fe
e1bcd6a
4131f20
b9a67fe
e1bcd6a
0dd8151
f5cfe60
e1bcd6a
b9a67fe
0dd8151
e1bcd6a
b9a67fe
 
 
 
 
 
 
 
e1bcd6a
 
b9a67fe
0dd8151
b9a67fe
e1bcd6a
b9a67fe
0dd8151
b9a67fe
 
 
e1bcd6a
b9a67fe
 
 
 
 
e1bcd6a
b9a67fe
 
 
0dd8151
e1bcd6a
0dd8151
b9a67fe
 
 
e1bcd6a
b9a67fe
 
 
 
 
e1bcd6a
0dd8151
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images
from io import BytesIO
from PIL import Image

# Load the model and processor
# NOTE: only the (lightweight) chat processor and tokenizer are loaded at
# import time; the full causal LM is loaded inside describe_image().
model_path = "deepseek-ai/deepseek-vl-1.3b-chat"  # Hugging Face model id
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)  # downloads/caches on first run
tokenizer = vl_chat_processor.tokenizer  # reused for decoding generated ids

# Define the function for image description (CPU-only)

# Cache for the lazily-loaded language model: the original code called
# from_pretrained inside the handler, re-loading the full checkpoint on
# every single request.
_vl_gpt = None


def _load_model():
    """Load the DeepSeek-VL causal LM once (CPU, bfloat16) and cache it."""
    global _vl_gpt
    if _vl_gpt is None:
        _vl_gpt = (
            AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
            .to(torch.bfloat16)
            .cpu()
            .eval()
        )
    return _vl_gpt


def describe_image(image, user_question="Solve the problem in the image"):
    """Answer *user_question* about *image* with DeepSeek-VL (CPU-only).

    Args:
        image: PIL.Image from the Gradio upload widget; may be None if the
            user clicked the button without uploading anything.
        user_question: Question about the image. Gradio sends "" (not the
            parameter default) for a blank textbox, so an empty value falls
            back to the default prompt explicitly.

    Returns:
        The model's decoded answer, or an "Error: ..." string that the UI
        renders in the same output textbox.
    """
    try:
        if image is None:
            return "Error: please upload an image first."
        # Gradio passes "" when the textbox is left blank; restore the default.
        if not user_question:
            user_question = "Solve the problem in the image"

        # Normalise the upload to PNG bytes so downstream processing sees a
        # consistent format regardless of the original file type.
        image_byte_arr = BytesIO()
        image.save(image_byte_arr, format="PNG")
        image_byte_arr.seek(0)
        pil_images = [Image.open(image_byte_arr)]

        # Single-turn conversation in the DeepSeek-VL chat format; the PIL
        # images are passed separately to the processor below.
        conversation = [
            {
                "role": "User",
                "content": f"<image_placeholder>{user_question}",
                "images": [image_byte_arr],
            },
            {
                "role": "Assistant",
                "content": ""
            }
        ]

        # Tokenize the prompt and encode the image; keep everything on CPU.
        prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to('cpu')

        vl_gpt = _load_model()  # cached after the first call

        # Fuse image features into the text embedding sequence.
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # Greedy decoding (do_sample=False) for reproducible answers.
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True
        )

        # Decode the generated tokens into text
        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer

    except Exception as e:  # surface failures in the UI instead of crashing
        return f"Error: {str(e)}"

# Gradio interface
def gradio_app():
    """Build and launch the Gradio UI for DeepSeek-VL image Q&A."""
    with gr.Blocks() as app:
        gr.Markdown(
            "# Image Description with DeepSeek VL 1.3b 🐬\n"
            "### Upload an image and ask a question about it."
        )

        with gr.Row():
            uploaded_image = gr.Image(type="pil", label="Upload an Image")
            question_box = gr.Textbox(
                label="Question (optional)",
                placeholder="Ask a question about the image (e.g., 'What is happening in this image?')",
                lines=2,
            )

        generate_button = gr.Button("Generate Description")
        description_box = gr.Textbox(label="Image Description", interactive=False)

        # Wire the button to the inference handler: image + question in,
        # generated description (or error string) out.
        generate_button.click(
            fn=describe_image,
            inputs=[uploaded_image, question_box],
            outputs=description_box,
        )

    app.launch()

# Launch the Gradio app only when run as a script; the guard keeps the UI
# from auto-launching if this module is ever imported elsewhere.
if __name__ == "__main__":
    gradio_app()