# sflindrs's picture
# Update app.py
# 4bf5dae verified
# raw
# history blame
# 4.06 kB
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch
import spaces
import json
# Load the processor and the model from the same checkpoint.
MODEL_ID = 'allenai/Molmo-7B-D-0924'

# Both from_pretrained calls receive exactly the same keyword arguments,
# so they are spelled out once and unpacked into each call.
_LOAD_OPTIONS = dict(
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

processor = AutoProcessor.from_pretrained(MODEL_ID, **_LOAD_OPTIONS)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_LOAD_OPTIONS)
import json
def wrap_json_in_markdown(text):
    """Wrap every JSON object or array embedded in *text* in a Markdown fence.

    The text is scanned left to right. At each ``{`` or ``[`` the standard
    JSON decoder is asked to parse a complete value starting at that exact
    position. When it succeeds, the value is re-serialized with a 4-space
    indent and emitted inside a json code fence; scanning then resumes
    right after the parsed value. When it fails, the bracket is treated as
    ordinary text and scanning continues with the next character.

    Letting the decoder find the end of each value (instead of counting
    brackets by hand) keeps brackets inside JSON strings from ending a value
    early, and lets valid JSON be found even after stray or mismatched
    brackets earlier in the text.

    Args:
        text: Arbitrary text that may contain JSON objects or arrays.

    Returns:
        The text with each parsed JSON value replaced by a fenced,
        pretty-printed copy; all other text is left unchanged.
    """
    decoder = json.JSONDecoder()
    pieces = []
    plain_start = 0  # Start of the not-yet-emitted plain-text run.
    pos = 0
    length = len(text)
    while pos < length:
        if text[pos] not in '{[':
            pos += 1
            continue
        try:
            parsed, end = decoder.raw_decode(text, pos)
        except (json.JSONDecodeError, RecursionError):
            # Not the start of a valid JSON value (or nested too deeply to
            # parse): keep the character as plain text and move on.
            pos += 1
            continue
        pieces.append(text[plain_start:pos])  # Text preceding the JSON.
        pieces.append(f"\n```json\n{json.dumps(parsed, indent=4)}\n```\n")
        pos = plain_start = end  # Resume right after the parsed value.
    pieces.append(text[plain_start:])  # Trailing plain text, if any.
    return ''.join(pieces)
@spaces.GPU()
def process_image_and_text(image, text):
    """Run the model on one image and prompt and return its formatted reply.

    Args:
        image: Array accepted by ``Image.fromarray`` (the UI supplies the
            upload through ``gr.Image(type="numpy")``).
        text: The prompt passed to the processor alongside the image.

    Returns:
        The decoded generated text, with any embedded JSON wrapped in
        Markdown code fences by ``wrap_json_in_markdown``.
    """
    # Turn the raw array into a PIL image and let the processor build the
    # model inputs for this single example.
    pil_image = Image.fromarray(image)
    features = processor.process(images=[pil_image], text=text)

    # Put every input on the model's device and add a leading batch
    # dimension so the batch has size 1.
    batch = {}
    for name, tensor in features.items():
        batch[name] = tensor.to(model.device).unsqueeze(0)

    generation_config = GenerationConfig(
        max_new_tokens=1024,
        stop_strings="<|endoftext|>",
    )
    output = model.generate_from_batch(
        batch,
        generation_config,
        tokenizer=processor.tokenizer,
    )

    # Skip the first input_ids.size(1) positions so only the newly generated
    # tokens are decoded.
    prompt_length = batch['input_ids'].size(1)
    new_tokens = output[0, prompt_length:]
    decoded = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
    return wrap_json_in_markdown(decoded)
def chatbot(image, text, history):
    """Handle one chat turn and return the messages to display.

    Args:
        image: The uploaded image, or None when nothing has been uploaded.
        text: The user's question about the image.
        history: List of earlier messages, each a dict with "role" and
            "content" keys. On a successful turn it is extended in place
            with the new user and assistant messages.

    Returns:
        The list of messages to show in the chat display. When no image is
        present this is a new list (history plus a hint) and ``history``
        itself is left untouched.
    """
    if image is None:
        # The hint must be a role/content dict like every other entry: the
        # chat display is created with type="messages", so a legacy
        # (user, bot) tuple would not match the rest of the list.
        notice = {"role": "assistant", "content": "Please upload an image first."}
        # Concatenate instead of appending so the hint is shown once but
        # never stored in the conversation history.
        return history + [notice]
    response = process_image_and_text(image, text)
    history.append({"role": "user", "content": text})
    history.append({"role": "assistant", "content": response})
    return history
# Build the Gradio interface: an image upload next to the chat display, a
# question box with a submit button, and per-session conversation state.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# without indentation; the Row is assumed to hold only the image and the
# chat display — confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("# Image Chatbot with Molmo-7B-D-0924")

    with gr.Row():
        image_input = gr.Image(type="numpy")
        chatbot_output = gr.Chatbot(type="messages")

    text_input = gr.Textbox(placeholder="Ask a question about the image...")
    submit_button = gr.Button("Submit")

    state = gr.State([])

    # Clicking the button and pressing Enter in the text box run the same
    # handler with the same inputs and outputs.
    handler_inputs = [image_input, text_input, state]
    handler_outputs = [chatbot_output]
    for register in (submit_button.click, text_input.submit):
        register(
            chatbot,
            inputs=list(handler_inputs),
            outputs=list(handler_outputs),
        )

demo.launch()