mrcuddle committed
Commit bec473c · verified · 1 Parent(s): 82c21d0

Update app.py

Files changed (1):
  app.py +8 -5
app.py CHANGED

@@ -5,7 +5,7 @@ import torch
 import numpy as np
 import spaces
 
-# Load the Pixtral model and processor
+# Load the Llava model and processor
 model_id = "mrcuddle/lumimaid-v0.2-8b-pixtral"
 processor = AutoProcessor.from_pretrained(model_id)
 model = LlavaForConditionalGeneration.from_pretrained(model_id, ignore_mismatched_sizes=True).to("cuda")
@@ -19,6 +19,9 @@ def generate_text(input_text="", image=None):
     image_np = np.array(image)
     image_pil = Image.fromarray(image_np.astype('uint8'), 'RGB')
 
+    # Resize the image to the expected resolution (336 x 336)
+    image_pil = image_pil.resize((336, 336))
+
     # Use a default prompt if no text is provided
     if not input_text:
         input_text = "Describe the image."
@@ -26,10 +29,10 @@ def generate_text(input_text="", image=None):
     # Prepare inputs
     inputs = processor(text=input_text, images=image_pil, return_tensors="pt").to("cuda")
 
-    # Debug: Print the keys and shapes of the inputs dictionary
+    # Debug: Print the keys and types of the inputs dictionary
     print("Processor output keys:", inputs.keys())
     for key, value in inputs.items():
-        print(f"{key}: {value.shape}")
+        print(f"{key}: {type(value)}")
 
     # Check if image tokens are generated
     if 'input_ids' not in inputs or inputs['input_ids'].numel() == 0:
@@ -46,8 +49,8 @@ iface = gr.Interface(
     fn=generate_text,
     inputs=[gr.Textbox(label="Enter your text here (optional)", value=""), gr.Image(label="Upload an image", type="pil")],
     outputs=gr.Textbox(label="Generated Text"),
-    title="Pixtral Model Interaction",
-    description="Interact with the Pixtral model using text and image inputs. If no text is provided, the model will describe the image."
+    title="Llava Model Interaction",
+    description="Interact with the Llava model using text and image inputs. If no text is provided, the model will describe the image."
 )
 
 # Launch the interface
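For context, a minimal sketch of the preprocessing step this commit adds, assuming 336 x 336 is the input resolution the checkpoint's vision tower expects (as the new comment in the diff states). The model id and resize mirror the diff; the input file name ("example.jpg") and the printed keys are illustrative assumptions, not taken from the source.

    # Sketch only: reproduces the resize + processor call from the updated app.py
    from PIL import Image
    from transformers import AutoProcessor

    model_id = "mrcuddle/lumimaid-v0.2-8b-pixtral"      # checkpoint from the diff
    processor = AutoProcessor.from_pretrained(model_id)

    image = Image.open("example.jpg").convert("RGB")    # hypothetical input image
    image = image.resize((336, 336))                    # resolution targeted by the commit

    inputs = processor(text="Describe the image.", images=image, return_tensors="pt")
    print("keys:", list(inputs.keys()))                 # typically input_ids, attention_mask, pixel_values
    print("pixel_values shape:", inputs["pixel_values"].shape)

Note that the processor's image processor usually resizes and normalizes images on its own, so the explicit resize in the commit is best read as a defensive step rather than a required one.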