HenryShan committed on
Commit 0dd8151 · verified
1 Parent(s): f5cfe60

Update app.py

Files changed (1)
  1. app.py +33 -45
app.py CHANGED
@@ -11,20 +11,20 @@ model_path = "deepseek-ai/deepseek-vl-1.3b-chat"
 vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
 
-
-def describe_image(image, user_question="You are the best AP teacher in the world. Analyze the AP problem in the image, and solve it step by step to let a student who don't know how to solve it understand"):
+# Define the function for image description (CPU version)
+def describe_image(image, user_question="Describe this image in great detail."):
     try:
-        # Convert the PIL Image to a BytesIO object for compatibility
+        # Convert the PIL Image to a BytesIO object
         image_byte_arr = BytesIO()
-        image.save(image_byte_arr, format="PNG")  # Save image in PNG format
-        image_byte_arr.seek(0)  # Move pointer to the start
+        image.save(image_byte_arr, format="PNG")
+        image_byte_arr.seek(0)
 
-        # Define the conversation, using the user's question
+        # Define the conversation
         conversation = [
             {
                 "role": "User",
                 "content": f"<image_placeholder>{user_question}",
-                "images": [image_byte_arr]  # Pass the image byte array instead of an object
+                "images": [image_byte_arr]
             },
             {
                 "role": "Assistant",
@@ -32,42 +32,30 @@ def describe_image(image, user_question="You are the best AP teacher in the worl
             }
         ]
 
-        # Convert image byte array back to a PIL image for processing
-        pil_images = [Image.open(BytesIO(image_byte_arr.read()))]  # Convert byte back to PIL Image
-        image_byte_arr.seek(0)  # Reset the byte stream again for reuse
+        # Convert byte array back to PIL image
+        pil_images = [Image.open(BytesIO(image_byte_arr.read()))]
+        image_byte_arr.seek(0)
 
-        # Load images and prepare the inputs
+        # Prepare inputs with CPU and float32 type
         prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
-        )
-
-        # Explicitly cast all tensors in prepare_inputs to torch.float16
-        prepare_inputs = {
-            k: v.to(torch.float16) if isinstance(v, torch.Tensor) else v
-            for k, v in prepare_inputs.items()
-        }
-
-        # Load and prepare the model
-        vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(torch.float16).eval()
-        vl_gpt = vl_gpt.to(torch.float16)  # Explicitly ensure all components are in float16
+        ).to(torch.float32)  # Convert to float32 for CPU compatibility
 
-        # Generate embeddings from the image input
-        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs).to(dtype=torch.float16)
+        # Load model with CPU and float32 weights
+        vl_gpt = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True
+        ).float().eval()  # Convert all weights to float32
 
-        # Ensure attention mask is also in torch.float16
-        attention_mask = prepare_inputs["attention_mask"].to(vl_gpt.device).to(dtype=torch.float16)
+        # Generate embeddings with CPU
+        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
 
-        # Debugging: Print tensor dtypes
-        print(f"Inputs Embeds dtype: {inputs_embeds.dtype}")
-        print(f"Attention Mask dtype: {attention_mask.dtype}")
-        print(f"Model dtype: {next(vl_gpt.parameters()).dtype}")
-
-        # Generate the model's response
+        # Generate response with CPU
         outputs = vl_gpt.language_model.generate(
-            inputs_embeds=inputs_embeds.to(torch.float16),
-            attention_mask=attention_mask.to(torch.float16),
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
@@ -76,36 +64,36 @@ def describe_image(image, user_question="You are the best AP teacher in the worl
            use_cache=True
        )
 
-        # Decode the generated tokens into text
-        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+        # Decode the response
+        answer = tokenizer.decode(outputs[0].tolist(), skip_special_tokens=True)
        return answer
+
    except Exception as e:
-        # Provide detailed error information
        return f"Error: {str(e)}"
+
 # Gradio interface
 def gradio_app():
    with gr.Blocks() as demo:
-        gr.Markdown("# Image Description with DeepSeek VL 1.3b 🐬\n### Upload an image and ask a question about it.")
+        gr.Markdown("# Image Description with DeepSeek VL 1.3b 🐬 (CPU Version)")
 
        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(
                label="Question (optional)",
-                placeholder="Ask a question about the image (e.g., 'What is happening in this image?')",
+                placeholder="Ask a question about the image",
                lines=2
            )
 
-        output_text = gr.Textbox(label="Solving the problem", interactive=False)
-
-        submit_btn = gr.Button("Solve")
+        output_text = gr.Textbox(label="Image Description", interactive=False)
+        submit_btn = gr.Button("Generate Description")
 
        submit_btn.click(
            fn=describe_image,
-            inputs=[image_input, question_input],  # Pass both image and question as inputs
+            inputs=[image_input, question_input],
            outputs=output_text
        )
 
    demo.launch()
 
-# Launch the Gradio app
-gradio_app()
+# Launch the app
+gradio_app()
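
A note on the change itself: the commit replaces the float16/GPU path with a float32/CPU path because half-precision matrix ops are not reliably supported on CPU in every torch build, which is why the new code calls .float() on the model and .to(torch.float32) on the inputs. The sketch below is not part of the commit; it only assumes torch is installed and probes the local build so you can confirm whether the float32 cast is actually needed.

# Minimal sketch, not from the commit: check whether this torch build
# supports float16 matmul on CPU. Older builds raise a RuntimeError
# ("... not implemented for 'Half'"), which is the usual reason a
# CPU-only Space casts the model to float32 with .float().
import torch

x = torch.randn(8, 8)
try:
    _ = x.half() @ x.half()  # may fail on CPU depending on the torch version
    print("float16 matmul on CPU: supported")
except RuntimeError as err:
    print(f"float16 matmul on CPU: not supported ({err})")

print("float32 matmul on CPU:", (x @ x).dtype)  # always available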