mrcuddle committed
Commit bec473c · verified · 1 Parent(s): 82c21d0

Update app.py

Files changed (1):
  app.py +8 -5
app.py CHANGED

@@ -5,7 +5,7 @@ import torch
 import numpy as np
 import spaces
 
-# Load the Pixtral model and processor
+# Load the Llava model and processor
 model_id = "mrcuddle/lumimaid-v0.2-8b-pixtral"
 processor = AutoProcessor.from_pretrained(model_id)
 model = LlavaForConditionalGeneration.from_pretrained(model_id, ignore_mismatched_sizes=True).to("cuda")
@@ -19,6 +19,9 @@ def generate_text(input_text="", image=None):
     image_np = np.array(image)
     image_pil = Image.fromarray(image_np.astype('uint8'), 'RGB')
 
+    # Resize the image to the expected resolution (336 x 336)
+    image_pil = image_pil.resize((336, 336))
+
     # Use a default prompt if no text is provided
     if not input_text:
         input_text = "Describe the image."
@@ -26,10 +29,10 @@ def generate_text(input_text="", image=None):
     # Prepare inputs
     inputs = processor(text=input_text, images=image_pil, return_tensors="pt").to("cuda")
 
-    # Debug: Print the keys and shapes of the inputs dictionary
+    # Debug: Print the keys and types of the inputs dictionary
     print("Processor output keys:", inputs.keys())
     for key, value in inputs.items():
-        print(f"{key}: {value.shape}")
+        print(f"{key}: {type(value)}")
 
     # Check if image tokens are generated
     if 'input_ids' not in inputs or inputs['input_ids'].numel() == 0:
@@ -46,8 +49,8 @@ iface = gr.Interface(
     fn=generate_text,
     inputs=[gr.Textbox(label="Enter your text here (optional)", value=""), gr.Image(label="Upload an image", type="pil")],
     outputs=gr.Textbox(label="Generated Text"),
-    title="Pixtral Model Interaction",
-    description="Interact with the Pixtral model using text and image inputs. If no text is provided, the model will describe the image."
+    title="Llava Model Interaction",
+    description="Interact with the Llava model using text and image inputs. If no text is provided, the model will describe the image."
 )
 
 # Launch the interface
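For context, a minimal sketch of the preprocessing step this commit adds, assuming 336 x 336 is the input resolution the checkpoint's vision tower expects (as the new comment in the diff states). The model id and resize mirror the diff; the input file name ("example.jpg") and the printed keys are illustrative assumptions, not taken from the source.

    # Sketch only: reproduces the resize + processor call from the updated app.py
    from PIL import Image
    from transformers import AutoProcessor

    model_id = "mrcuddle/lumimaid-v0.2-8b-pixtral"      # checkpoint from the diff
    processor = AutoProcessor.from_pretrained(model_id)

    image = Image.open("example.jpg").convert("RGB")    # hypothetical input image
    image = image.resize((336, 336))                    # resolution targeted by the commit

    inputs = processor(text="Describe the image.", images=image, return_tensors="pt")
    print("keys:", list(inputs.keys()))                 # typically input_ids, attention_mask, pixel_values
    print("pixel_values shape:", inputs["pixel_values"].shape)

Note that the processor's image processor usually resizes and normalizes images on its own, so the explicit resize in the commit is best read as a defensive step rather than a required one.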