fun-image-caption

Sleeping

App Files Files Community

Dylan committed on Mar 24

Commit

598dcfa

1 Parent(s): 0160c44

added parallel map to call model multiple times

Browse files

Files changed (2) hide show

agents.py +20 -12
app.py +6 -1

agents.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import torch
 from langgraph.graph import END, StateGraph
-from typing import TypedDict, Any
 from transformers import (
     AutoProcessor,
@@ -23,23 +26,21 @@ class State(TypedDict):
     image: Any
     voice: str
     caption: str
-    description: str
 # Build the workflow graph
 def build_graph():
     workflow = StateGraph(State)
-    # Add nodes
-    # workflow.add_node("caption_image", caption_image_dummy)
-    # workflow.add_node("describe_with_voice", describe_with_voice_dummy)
     workflow.add_node("caption_image", caption_image)
     workflow.add_node("describe_with_voice", describe_with_voice)
     # Add edges
     workflow.set_entry_point("caption_image")
-    workflow.add_edge("caption_image", "describe_with_voice")
     workflow.add_edge("describe_with_voice", END)
     # Compile the graph
@@ -76,18 +77,19 @@ def describe_with_voice(state: State) -> State:
     caption = state["caption"]
     voice = state["voice"]
-    caption = "A golden retriever that seems to be smiling straight to the camera"
     # Voice prompt templates
     voice_prompts = {
         "scurvy-ridden pirate": "You are a scurvy-ridden pirate, angry and drunk.",
         "forgetful wizard": "You are a forgetful and easily distracted wizard.",
         "sarcastic teenager": "You are a sarcastic and disinterested teenager.",
     }
     messages = [
         {
             "role": "system",
-            "content": [{"type": "text", "text": voice_prompts.get(voice)}],
         },
         {
             "role": "user",
@@ -111,16 +113,22 @@ def describe_with_voice(state: State) -> State:
     description = processor.decode(generation, skip_special_tokens=True)
-    state["description"] = description
     print(description)
     return state
 def caption_image(state: State) -> State:
     # image is PIL
     image = state["image"]
-    image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
     # Load models (in practice, do this once and cache)
     messages = [

+import operator
+from helpers import image_to_base64
 import torch
 from langgraph.graph import END, StateGraph
+from langgraph.types import Send
+from typing import Annotated, TypedDict, Any
 from transformers import (
     AutoProcessor,
     image: Any
     voice: str
     caption: str
+    descriptions: Annotated[list, operator.add]
 # Build the workflow graph
 def build_graph():
     workflow = StateGraph(State)
     workflow.add_node("caption_image", caption_image)
     workflow.add_node("describe_with_voice", describe_with_voice)
     # Add edges
     workflow.set_entry_point("caption_image")
+    workflow.add_conditional_edges("caption_image", map_describe, ["describe_with_voice"])
+    # workflow.add_edge("caption_image", "describe_with_voice")
     workflow.add_edge("describe_with_voice", END)
     # Compile the graph
     caption = state["caption"]
     voice = state["voice"]
     # Voice prompt templates
     voice_prompts = {
         "scurvy-ridden pirate": "You are a scurvy-ridden pirate, angry and drunk.",
         "forgetful wizard": "You are a forgetful and easily distracted wizard.",
         "sarcastic teenager": "You are a sarcastic and disinterested teenager.",
+        "private investigator": "You are a Victorian-age detective. Suave and intellectual.",
+        "shakespearian": "Talk like one of Shakespeare's characters. ",
     }
+    system_prompt = voice_prompts.get(voice, "You are a pirate.") + " Output 5-10 sentences."
     messages = [
         {
             "role": "system",
+            "content": [{"type": "text", "text": system_prompt}],
         },
         {
             "role": "user",
     description = processor.decode(generation, skip_special_tokens=True)
+    # note that the return value is a list
+    state["description"] = [description]
     print(description)
     return state
+def map_describe(state: State) -> list:
+    # return list of `Send ` objects (3)
+    return [Send("describe_with_voice", {"caption" : state["caption"], "voice": state["voice"]})] * 3
 def caption_image(state: State) -> State:
     # image is PIL
     image = state["image"]
+    image = image_to_base64(image)
     # Load models (in practice, do this once and cache)
     messages = [

app.py CHANGED Viewed

@@ -15,8 +15,11 @@ def process_and_display(image, voice):
     # Run the graph
     result = graph.invoke(state)
     # Return the caption and description
-    return result["caption"], result["description"]
 def create_interface():
@@ -38,6 +41,8 @@ def create_interface():
                         "scurvy-ridden pirate",
                         "forgetful wizard",
                         "sarcastic teenager",
                     ],
                     label="Select a Voice",
                     value="scurvy-ridden pirate",

     # Run the graph
     result = graph.invoke(state)
+    descriptions:list[str] = result["descriptions"]
+    description = "\n---\n".join(descriptions)
     # Return the caption and description
+    return result["caption"], description
 def create_interface():
                         "scurvy-ridden pirate",
                         "forgetful wizard",
                         "sarcastic teenager",
+                        "private investigator",
+                        "shakespearian"
                     ],
                     label="Select a Voice",
                     value="scurvy-ridden pirate",