|
import os
import platform
import time
from threading import Thread

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
|
""" |
|
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference |
|
""" |
|
|
|
print(f"Is CUDA available: {torch.cuda.is_available()}") |
|
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") |
|
print(f"CUDA version: {torch.version.cuda}") |
|
print(f"Python version: {platform.python_version()}") |
|
print(f"Pytorch version: {torch.__version__}") |
|
print(f"Gradio version: {gr. __version__}") |
|
duration=10 |
|
|
|
login(token = os.getenv('gemma')) |
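# Next, load the Gemma 3 4B instruction-tuned checkpoint in bfloat16 on the GPU.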
|
ckpt = "google/gemma-3-4b-it" |
|
model = Gemma3ForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda") |
|
processor = AutoProcessor.from_pretrained(ckpt) |
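# bot_streaming below runs on a GPU allocated on demand by ZeroGPU for up to
# `duration` seconds per call.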
|
|
|
@spaces.GPU(duration=duration)
def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]

    messages = []
    images = []
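    # The loop below rebuilds the chat-template messages from the Gradio history:
    # an image turn is stored as a tuple holding the file path, a text turn as a
    # plain string, and each user message is paired with the assistant reply.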
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
            images.append(Image.open(msg[0][0]).convert("RGB"))
        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
            # Text belonging to an image turn was already handled above.
            pass
        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
if len(message["files"]) == 1: |
|
|
|
if isinstance(message["files"][0], str): |
|
image = Image.open(message["files"][0]).convert("RGB") |
|
else: |
|
image = Image.open(message["files"][0]["path"]).convert("RGB") |
|
images.append(image) |
|
messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]}) |
|
else: |
|
messages.append({"role": "user", "content": [{"type": "text", "text": txt}]}) |
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

    if images == []:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
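    # Stream tokens from generate(), which runs in a background thread so the
    # partial response can be yielded to the UI as it is produced.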
|
    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""

    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
|
|
|
|
|
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Gemma 3 Model by Google",
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=500,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        )
    ],
    cache_examples=False,
    description="Upload an image and start chatting about it, or just enter any text into the prompt to start.",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

demo.launch(debug=True)
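# Note: this script assumes a Hugging Face ZeroGPU Space with access to the gated
# google/gemma-3-4b-it weights and a Space secret named 'gemma' holding a valid token.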
|
|
|
|