Commit b5b9453 (parent 7d14b9f)
Dylan committed "some formatting"
agents.py CHANGED

@@ -24,7 +24,7 @@ def get_quantization_config():
 # Define the state schema
 class State(TypedDict):
     image: Any
-
+    voices: list
     caption: str
     descriptions: Annotated[list, operator.add]
 
@@ -40,7 +40,6 @@ def build_graph():
     workflow.set_entry_point("caption_image")
 
     workflow.add_conditional_edges("caption_image", map_describe, ["describe_with_voice"])
-    # workflow.add_edge("caption_image", "describe_with_voice")
    workflow.add_edge("describe_with_voice", END)
 
     # Compile the graph
@@ -59,23 +58,10 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
 ).eval()
 
 
-def describe_with_voice_dummy(state: State) -> State:
-    print("Describe")
-    voice = state["voice"]
-    state["description"] = f"Dummy description from {voice}"
-    return state
-
-
-def caption_image_dummy(state: State) -> State:
-    print("Caption")
-    voice = state["voice"]
-    state["caption"] = f"Dummy caption from {voice}"
-    return state
-
-
-def describe_with_voice(state: State) -> State:
+def describe_with_voice(state: State):
     caption = state["caption"]
-
+    # select one by default shakespeare
+    voice = state.get("voice", state.get("voices", ["shakespearian"])[0])
 
     # Voice prompt templates
     voice_prompts = {
@@ -108,24 +94,33 @@ def describe_with_voice(state: State) -> State:
     input_len = inputs["input_ids"].shape[-1]
 
     with torch.inference_mode():
-        generation = model.generate(**inputs, max_new_tokens=1000, do_sample=True, temperature=0.
+        generation = model.generate(**inputs, max_new_tokens=1000, do_sample=True, temperature=0.9)
         generation = generation[0][input_len:]
 
     description = processor.decode(generation, skip_special_tokens=True)
 
-
-
-    print(description)
+    formatted_description = f"#{voice.title()}\n{description}"
+    print(formatted_description)
 
-    return
+    # note that the return value is a list
+    return {"descriptions": [formatted_description]}
 
 
 def map_describe(state: State) -> list:
-    #
-
-
-
-
+    # Create a Send object for each selected voice
+    selected_voices = state["voices"]
+
+    # Generate description tasks for each selected voice
+    send_objects = []
+    for voice in selected_voices:
+        send_objects.append(
+            Send("describe_with_voice", {"caption": state["caption"], "voice": voice})
+        )
+
+    return send_objects
+
+
+def caption_image(state: State):
     # image is PIL
     image = state["image"]
     image = image_to_base64(image)
@@ -163,8 +158,6 @@ def caption_image(state: State) -> State:
     generation = generation[0][input_len:]
 
     caption = processor.decode(generation, skip_special_tokens=True)
-
-    state["caption"] = caption
     print(caption)
 
-    return
+    return {"caption" : caption}
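The heart of this change is LangGraph's map-reduce fan-out: map_describe returns one Send object per selected voice, LangGraph runs a describe_with_voice branch for each payload, and the operator.add reducer on State.descriptions concatenates the one-element lists the branches return. Below is a minimal runnable sketch of that pattern with the Gemma nodes stubbed out; the stub bodies and sample values are illustrative, not the Space's code.

import operator
from typing import Annotated, Any, TypedDict

from langgraph.constants import Send
from langgraph.graph import END, StateGraph

class State(TypedDict):
    image: Any
    voices: list
    caption: str
    descriptions: Annotated[list, operator.add]  # parallel branches are concatenated

def caption_image(state: State):
    # stand-in for the Gemma captioning node
    return {"caption": "a cat asleep on a windowsill"}

def map_describe(state: State) -> list:
    # one Send per voice; each branch receives its own private payload
    return [
        Send("describe_with_voice", {"caption": state["caption"], "voice": voice})
        for voice in state["voices"]
    ]

def describe_with_voice(state):
    # returning a one-element list lets the operator.add reducer merge branches
    return {"descriptions": [f"#{state['voice'].title()}\n...{state['caption']}..."]}

workflow = StateGraph(State)
workflow.add_node("caption_image", caption_image)
workflow.add_node("describe_with_voice", describe_with_voice)
workflow.set_entry_point("caption_image")
workflow.add_conditional_edges("caption_image", map_describe, ["describe_with_voice"])
workflow.add_edge("describe_with_voice", END)
graph = workflow.compile()

result = graph.invoke({"image": None, "voices": ["pirate", "shakespearian"],
                       "caption": "", "descriptions": []})
print(result["descriptions"])  # two formatted blocks, one per voice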
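Both model nodes use the same Transformers generation pattern that the hunks above only show a slice of: build the prompt with the processor, measure the prompt length, generate under torch.inference_mode(), then decode only the new tokens. A sketch of that slice, assuming the usual Gemma 3 chat-template call; the actual message construction happens outside the shown hunks, so the apply_chat_template call here is an assumption.

import torch

def generate_text(messages, model, processor):
    # assumed prompt construction; agents.py builds `inputs` above the shown hunk
    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_dict=True, return_tensors="pt",
    ).to(model.device)

    input_len = inputs["input_ids"].shape[-1]  # prompt length in tokens

    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=1000,
                                    do_sample=True, temperature=0.9)
    generation = generation[0][input_len:]  # drop the echoed prompt tokens

    return processor.decode(generation, skip_special_tokens=True)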
app.py CHANGED

@@ -8,9 +8,12 @@ graph = build_graph()
 
 
 @spaces.GPU(duration=60)
-def process_and_display(image,
+def process_and_display(image, voices):
+    if not voices:  # If no voices selected
+        return "Please select at least one voice persona.", "No voice personas selected."
+
     # Initialize state
-    state = {"image": image, "
+    state = {"image": image, "voices": voices, "caption": "", "descriptions": []}
 
     # Run the graph
     result = graph.invoke(state, {"max_concurrency" : 1})
@@ -26,11 +29,13 @@ def create_interface():
     with gr.Blocks() as demo:
         gr.Markdown("# Image Description with Voice Personas")
         gr.Markdown("""
-        This app takes an image and generates
+        This app takes an image and generates descriptions using selected voice personas.
 
         1. Upload an image
-        2. Select
+        2. Select voice personas from the multi-select dropdown
         3. Click "Generate Description" to see the results
+
+        The descriptions will be generated in parallel for all selected voices.
         """)
 
         with gr.Row():
@@ -39,19 +44,20 @@ def create_interface():
                 voice_dropdown = gr.Dropdown(
                     choices=[
                         "scurvy-ridden pirate",
-                        "forgetful wizard",
-                        "sarcastic teenager",
                         "private investigator",
+                        "sarcastic teenager",
+                        "forgetful wizard",
                         "shakespearian"
                     ],
-                    label="Select
-
+                    label="Select Voice Personas (max 2 recommended)",
+                    multiselect=True,
+                    value=["scurvy-ridden pirate", "private investigator"]
                 )
                 submit_button = gr.Button("Generate Description")
 
             with gr.Column():
-                caption_output = gr.Textbox(label="Image Caption")
-                description_output = gr.Textbox(label="Voice
+                caption_output = gr.Textbox(label="Image Caption", lines=4)
+                description_output = gr.Textbox(label="Voice Descriptions", lines=10)
 
         submit_button.click(
             fn=process_and_display,
@@ -66,4 +72,4 @@ def create_interface():
 demo = create_interface()
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
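One piece the app.py hunks leave out is the success-path return of process_and_display. Given that the click handler feeds two Textboxes (caption_output and description_output), it presumably unpacks the final graph state along these lines; this is a hedged sketch, not the committed code.

def process_and_display(image, voices):
    if not voices:  # If no voices selected
        return "Please select at least one voice persona.", "No voice personas selected."

    # Initialize state
    state = {"image": image, "voices": voices, "caption": "", "descriptions": []}

    # Run the graph; max_concurrency=1 runs the fanned-out voice branches one
    # at a time, so a single GPU never serves several generations at once
    result = graph.invoke(state, {"max_concurrency": 1})

    # one formatted block per voice, joined for the multi-line Textbox (assumed)
    return result["caption"], "\n\n".join(result["descriptions"])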