fun-image-caption

Sleeping

App Files Files Community

Dylan committed on Mar 23

Commit

a4690cb

1 Parent(s): 98efca2

added description agents -- dummy

Browse files

Files changed (4) hide show

agents.py +146 -0
app.backup.py +9 -0
app.py +59 -4
helpers.py +9 -0

agents.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import torch
+from langgraph.graph import END, StateGraph
+from typing import TypedDict, Any
+from transformers import (
+    AutoProcessor,
+    BitsAndBytesConfig,
+    Gemma3ForConditionalGeneration,
+)
+def get_quantization_config():
+    """Build the 4-bit bitsandbytes quantization settings.
+
+    Returns:
+        A ``BitsAndBytesConfig`` requesting 4-bit NF4 weights with double
+        quantization and float16 compute.
+    """
+    quant_kwargs = {
+        "load_in_4bit": True,
+        "bnb_4bit_quant_type": "nf4",
+        "bnb_4bit_use_double_quant": True,
+        "bnb_4bit_compute_dtype": torch.float16,
+    }
+    return BitsAndBytesConfig(**quant_kwargs)
+# Define the state schema
+class State(TypedDict):
+    """State dictionary handed from node to node in the workflow graph."""
+    # Input image; caption_image2 reads it and notes that it is a PIL image.
+    image: Any
+    # Name of the selected persona; used as the key into the voice prompts.
+    voice: str
+    # Caption text written by the caption_image nodes.
+    caption: str
+    # Persona-styled text written by the describe_with_voice nodes.
+    description: str
+# Build the workflow graph
+def build_graph():
+    """Assemble and compile the two-step captioning workflow.
+
+    Returns:
+        The compiled graph, which runs ``caption_image`` first and then
+        ``describe_with_voice`` before finishing.
+    """
+    graph = StateGraph(State)
+    # Register each node function under its node name.
+    for node_name, node_fn in (
+        ("caption_image", caption_image),
+        ("describe_with_voice", describe_with_voice),
+    ):
+        graph.add_node(node_name, node_fn)
+    # Linear pipeline: entry -> caption_image -> describe_with_voice -> END.
+    graph.set_entry_point("caption_image")
+    graph.add_edge("caption_image", "describe_with_voice")
+    graph.add_edge("describe_with_voice", END)
+    return graph.compile()
+# Checkpoint identifier passed to from_pretrained for both processor and model.
+model_id = "google/gemma-3-4b-it"
+# Initialize processor and model
+# NOTE: these calls run at import time, so importing this module loads both.
+processor = AutoProcessor.from_pretrained(model_id)
+model = Gemma3ForConditionalGeneration.from_pretrained(
+    model_id,
+    # 4-bit quantization (see get_quantization_config) is currently disabled.
+    # quantization_config=get_quantization_config(),
+    device_map="auto",
+    # Weights are requested as float16.
+    torch_dtype=torch.float16,
+)
+def describe_with_voice(state: State) -> State:
+    """Placeholder node that stores a fixed description in the state.
+
+    Args:
+        state: Workflow state; its ``description`` entry is overwritten.
+
+    Returns:
+        The same state object that was passed in.
+    """
+    state.update(description="Dummy description")
+    return state
+def caption_image(state: State) -> State:
+    """Placeholder node that stores a fixed caption in the state.
+
+    Args:
+        state: Workflow state; its ``caption`` entry is overwritten.
+
+    Returns:
+        The same state object that was passed in.
+    """
+    state["caption"] = "Dummy caption"
+    # Return the state like describe_with_voice does; without this the node
+    # returns None and the caption is not handed back to the graph.
+    return state
+def describe_with_voice2(state: State) -> State:
+    """Rewrite the caption in the selected persona's voice using the model.
+
+    Args:
+        state: Workflow state; ``caption`` and ``voice`` are read.
+
+    Returns:
+        The same state object with ``description`` set to the decoded
+        model output (at most 100 new tokens, greedy decoding).
+
+    Raises:
+        KeyError: If ``caption`` or ``voice`` is missing from the state.
+    """
+    caption = state["caption"]
+    voice = state["voice"]
+    # Voice prompt templates
+    voice_prompts = {
+        "scurvy-ridden pirate": "You are a scurvy-ridden pirate, angry and drunk.",
+        "forgetful wizard": "You are a forgetful and easily distracted wizard.",
+        "sarcastic teenager": "You are a sarcastic and disinterested teenager.",
+    }
+    # An unrecognised voice falls back to a neutral persona rather than
+    # putting None into the chat template.
+    system_prompt = voice_prompts.get(voice, "You are a helpful assistant.")
+    messages = [
+        # Content parts are typed dicts, matching the user message below.
+        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": f"Describe the following:\n\n{caption}"}
+            ],
+        },
+    ]
+    # Cast to the dtype the model was loaded with instead of a hard-coded one.
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device, dtype=model.dtype)
+    input_len = inputs["input_ids"].shape[-1]
+    with torch.inference_mode():
+        generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        # Drop the prompt tokens so only the newly generated text is decoded.
+        generation = generation[0][input_len:]
+    description = processor.decode(generation, skip_special_tokens=True)
+    state["description"] = description
+    return state
+def caption_image2(state: State) -> State:
+    """Generate a caption for the image in the state using the model.
+
+    Args:
+        state: Workflow state; ``image`` is read.
+
+    Returns:
+        The same state object with ``caption`` set to the decoded model
+        output (at most 100 new tokens, greedy decoding).
+
+    Raises:
+        KeyError: If ``image`` is missing from the state.
+        ValueError: If ``image`` is None.
+    """
+    # image is PIL
+    image = state["image"]
+    if image is None:
+        # Fail with a clear message instead of an error from the processor.
+        raise ValueError("An image is required to generate a caption.")
+    # The module-level processor and model are reused for every call.
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a helpful assistant that will describe images in 3-5 sentences.",
+                }
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        },
+    ]
+    # Cast to the dtype the model was loaded with; a hard-coded bfloat16
+    # cast does not match the float16 weights requested at load time.
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device, dtype=model.dtype)
+    input_len = inputs["input_ids"].shape[-1]
+    with torch.inference_mode():
+        generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        # Drop the prompt tokens so only the newly generated text is decoded.
+        generation = generation[0][input_len:]
+    caption = processor.decode(generation, skip_special_tokens=True)
+    state["caption"] = caption
+    return state

app.backup.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import gradio as gr
+def greet(name):
+    """Return a greeting that wraps ``name`` in "Hello " and "!!"."""
+    pieces = ("Hello ", name, "!!")
+    return "".join(pieces)
+# Wrap greet in a Gradio interface with one text input and one text output.
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+# No __main__ guard here: launch() runs whenever this file is executed or imported.
+demo.launch()

app.py CHANGED Viewed

@@ -1,7 +1,62 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+from agents import build_graph
+# Initialize the graph
+# Built once at import time and reused by process_and_display on every call.
+graph = build_graph()
+def process_and_display(image, voice):
+    """Run the workflow graph for one image and voice and return its text.
+
+    Args:
+        image: Image value forwarded to the graph under the ``image`` key.
+        voice: Persona name forwarded to the graph under the ``voice`` key.
+
+    Returns:
+        A ``(caption, description)`` tuple taken from the graph's result.
+    """
+    # Seed every state key; the text fields start empty for the nodes to fill.
+    initial_state = dict(image=image, voice=voice, caption="", description="")
+    final_state = graph.invoke(initial_state)
+    return final_state["caption"], final_state["description"]
+def create_interface():
+    """Build the Gradio Blocks UI for captioning an image in a chosen voice.
+
+    Returns:
+        The ``gr.Blocks`` app: an image upload, a voice dropdown and a
+        button on the left, and the caption and description text boxes on
+        the right. Clicking the button calls ``process_and_display``.
+    """
+    intro_text = """
+        This app takes an image and generates a description using a selected voice persona.
+        1. Upload an image
+        2. Select a voice persona from the dropdown
+        3. Click "Generate Description" to see the results
+        """
+    voice_choices = [
+        "scurvy-ridden pirate",
+        "forgetful wizard",
+        "sarcastic teenager",
+    ]
+    with gr.Blocks() as app:
+        gr.Markdown("# Image Description with Voice Personas")
+        gr.Markdown(intro_text)
+        with gr.Row():
+            # Left column: inputs and the trigger button.
+            with gr.Column():
+                image_box = gr.Image(type="pil", label="Upload an Image")
+                voice_box = gr.Dropdown(
+                    choices=voice_choices,
+                    label="Select a Voice",
+                    value="scurvy-ridden pirate",
+                )
+                run_button = gr.Button("Generate Description")
+            # Right column: the two text results.
+            with gr.Column():
+                caption_box = gr.Textbox(label="Image Caption")
+                description_box = gr.Textbox(label="Voice Description")
+        run_button.click(
+            fn=process_and_display,
+            inputs=[image_box, voice_box],
+            outputs=[caption_box, description_box],
+        )
+    return app
+# Launch the app
+# The interface is built at import time, so `demo` exists as a module
+# attribute; the server only starts when this file is run as a script.
+demo = create_interface()
+if __name__ == "__main__":
+    demo.launch()

helpers.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import base64
+import io
+def image_to_base64(image):
+    """Convert PIL Image to base64 encoded string
+
+    The image is encoded as JPEG. Images in a mode the JPEG encoder cannot
+    write (for example RGBA, LA or P) are converted to RGB first, so
+    transparency is dropped instead of the save raising an error.
+
+    Args:
+        image: PIL image to encode.
+
+    Returns:
+        The JPEG bytes as a base64 string.
+    """
+    # Modes Pillow's JPEG encoder accepts directly.
+    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+    if image.mode not in jpeg_modes:
+        image = image.convert("RGB")
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format="JPEG")
+    return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")