Spaces:

mayf
/

1

Sleeping

App Files Files Community

mayf committed on Apr 28

Commit

0fdc556

verified ·

1 Parent(s): e9fb854

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -25

app.py CHANGED Viewed

@@ -10,75 +10,75 @@ import tempfile
 # Page setup: configure the page title and the centered layout, then render
 # the app heading.
 st.set_page_config(page_title="Storyteller for Kids", layout="centered")
 st.title("🖼️ ➡️ 📖 Interactive Storyteller")
-# —––––––– Load & warm models
 @st.cache_resource
 def load_pipelines():
     """Build the captioning and story pipelines and warm them up.

     Decorated with ``st.cache_resource`` so the models are not rebuilt on
     every script run.

     Returns:
         tuple: ``(captioner, storyteller)`` -- an ``image-to-text`` pipeline
         (BLIP-base) and a ``text2text-generation`` pipeline (Flan-T5-Large).
     """
     # Local import: choose the GPU only when CUDA is actually usable, so the
     # app also starts on CPU-only hosts instead of failing on device 0.
     import torch

     device = 0 if torch.cuda.is_available() else -1

     # 1) BLIP-base for image captions
     captioner = pipeline(
         "image-to-text",
         model="Salesforce/blip-image-captioning-base",
         device=device,
     )
     # 2) Flan-T5-Large for instruction following
     storyteller = pipeline(
         "text2text-generation",
         model="google/flan-t5-large",
         device=device,
     )
     # Warm up both models with throwaway inputs so the first real call is
     # faster; the outputs are intentionally discarded.
     dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
     captioner(dummy)
     storyteller("Hello", max_new_tokens=1)
     return captioner, storyteller
 captioner, storyteller = load_pipelines()

 # —––––––– Main UI
 # Flow: upload image -> caption -> story -> spoken audio.
 uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
 if uploaded:
     # 1) Preprocess + display: force RGB and the same 384x384 size that the
     #    warm-up image uses.
     image = Image.open(uploaded).convert("RGB")
     image = image.resize((384, 384), Image.LANCZOS)
     st.image(image, caption="Your image", use_container_width=True)

     # 2) Caption
     with st.spinner("🔍 Generating caption..."):
         cap = captioner(image)[0]["generated_text"].strip()
     st.markdown(f"**Caption:** {cap}")

     # 3) Story — stronger, clearer prompt
     # NOTE(review): the prompt hard-codes a panda as the subject regardless
     # of what the caption says; confirm this is intended for arbitrary uploads.
     prompt = (
         f"Here’s an image description: “{cap}”.\n\n"
         "Write a playful, 80–100 word story for 3–10 year-old children.\n"
         "- Focus only on the panda and what it’s doing.\n"
         "- Do not introduce any other characters (no kids, no parents).\n"
         "- Be vivid: mention the panda’s feelings or the crunchy meat.\n\n"
         "Story:"
     )
     with st.spinner("✍️ Writing story..."):
         out = storyteller(
             prompt,
             max_new_tokens=130,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
             top_k=50,
             repetition_penalty=1.3,
             no_repeat_ngram_size=3,
         )
         # Keep only the text after the last "Story:" marker in case the
         # model echoes it back.
         raw = out[0]["generated_text"]
         story = raw.split("Story:")[-1].strip()
     st.markdown("**Story:**")
     st.write(story)

     # 4) Text-to-Speech (gTTS)
     if story:
         with st.spinner("🔊 Converting to speech..."):
             tts = gTTS(text=story, lang="en")
             # The context manager closes and removes the temp file; the
             # audio is handed to Streamlit as bytes so nothing is left
             # behind on disk and no file handle stays open.
             with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
                 tts.write_to_fp(tmp)
                 tmp.seek(0)
                 audio_bytes = tmp.read()
         st.audio(audio_bytes, format="audio/mp3")
     else:
         # gTTS cannot synthesise empty text, so skip audio instead of crashing.
         st.warning("The story came back empty, so there is nothing to read aloud.")

 # Page setup: configure the page title and the centered layout, then render
 # the app heading.
 st.set_page_config(page_title="Storyteller for Kids", layout="centered")
 st.title("🖼️ ➡️ 📖 Interactive Storyteller")
+# —––––––– Load & warm pipelines
 @st.cache_resource
 def load_pipelines():
     """Build the captioning and story pipelines and warm them up.

     Decorated with ``st.cache_resource`` so the models are not rebuilt on
     every script run.

     Returns:
         tuple: ``(captioner, storyteller)`` -- an ``image-to-text`` pipeline
         (BLIP-base) and a ``text2text-generation`` pipeline (Flan-T5-Large).
     """
     # Local import: choose the GPU only when CUDA is actually usable, so the
     # app also starts on CPU-only hosts instead of failing on device 0.
     import torch

     device = 0 if torch.cuda.is_available() else -1

     # 1) BLIP-base for captions
     captioner = pipeline(
         "image-to-text",
         model="Salesforce/blip-image-captioning-base",
         device=device,
     )
     # 2) Flan-T5-Large for coherent, instruction-driven stories
     storyteller = pipeline(
         "text2text-generation",
         model="google/flan-t5-large",
         device=device,
     )
     # Warm-up so first real inference is faster; outputs are discarded.
     # `return_full_text` is deliberately not passed: it is an option of the
     # "text-generation" pipeline, and a "text2text-generation" pipeline
     # forwards unknown keyword arguments to `generate()`, which rejects it.
     dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
     captioner(dummy)
     storyteller("Hello", max_new_tokens=1)
     return captioner, storyteller
 captioner, storyteller = load_pipelines()

 # —––––––– Main UI
 # Flow: upload image -> caption -> story -> spoken audio.
 uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
 if uploaded:
     # 1) Preprocess & display: force RGB and the same 384x384 size that the
     #    warm-up image uses.
     image = Image.open(uploaded).convert("RGB")
     image = image.resize((384, 384), Image.LANCZOS)
     st.image(image, caption="Your image", use_container_width=True)

     # 2) Generate caption
     with st.spinner("🔍 Generating caption..."):
         cap = captioner(image)[0]["generated_text"].strip()
     st.markdown(f"**Caption:** {cap}")

     # 3) Generate story with stronger prompt
     # NOTE(review): the prompt hard-codes a panda as the subject regardless
     # of what the caption says; confirm this is intended for arbitrary uploads.
     prompt = (
         f"Here is an image description: “{cap}”.\n"
         "Write an 80–100 word playful story for 3–10 year-old children that:\n"
         "1) Sets the scene with the panda and its surroundings.\n"
         "2) Describes what the panda is doing and how it feels.\n"
         "3) Wraps up with a fun conclusion.\n\n"
         "Story:"
     )
     with st.spinner("✍️ Generating story..."):
         # No `return_full_text` here: that option belongs to the
         # "text-generation" pipeline. A "text2text-generation" pipeline
         # forwards unknown keyword arguments to `generate()`, which rejects
         # it, and its output is the generated text only, without the prompt.
         out = storyteller(
             prompt,
             max_new_tokens=120,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
             top_k=50,
             repetition_penalty=1.2,
             no_repeat_ngram_size=3,
         )
         story = out[0]["generated_text"].strip()
     st.markdown("**Story:**")
     st.write(story)

     # 4) Text-to-Speech via gTTS
     if story:
         with st.spinner("🔊 Converting to speech..."):
             tts = gTTS(text=story, lang="en")
             # The context manager closes and removes the temp file; the
             # audio is handed to Streamlit as bytes so nothing is left
             # behind on disk and no file handle stays open.
             with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
                 tts.write_to_fp(tmp)
                 tmp.seek(0)
                 audio_bytes = tmp.read()
         st.audio(audio_bytes, format="audio/mp3")
     else:
         # gTTS cannot synthesise empty text, so skip audio instead of crashing.
         st.warning("The story came back empty, so there is nothing to read aloud.")