# app.py
import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile
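# NOTE (assumed environment, not listed in the original file): this app expects roughly
#   pip install streamlit pillow transformers torch gTTS
# torch (with CUDA, given device=0 below) is pulled in by the transformers pipelines.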
# --------- Page config
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")
# --------- Load & warm models
@st.cache_resource
def load_pipelines():
    # 1) BLIP-base for image captions
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=0
    )
    # 2) Flan-T5-Large for instruction following
    storyteller = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=0
    )
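    # NOTE: device=0 assumes a CUDA-capable GPU is visible to PyTorch.
    # A possible CPU fallback (sketch, not part of the original code):
    #   import torch
    #   device = 0 if torch.cuda.is_available() else -1
    # and pass device=device to both pipelines above.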
    # Warm up so first real call is faster
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    storyteller("Hello", max_new_tokens=1)
    return captioner, storyteller
captioner, storyteller = load_pipelines()
# --------- Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if uploaded:
    # 1) Preprocess + display
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)
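    # NOTE: 384x384 matches the input resolution of the BLIP captioning checkpoint;
    # the pipeline's processor resizes internally anyway, so this mainly keeps the
    # displayed image consistent with what the model sees.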
# 2) Caption
with st.spinner("🔍 Generating caption..."):
cap = captioner(image)[0]["generated_text"].strip()
st.markdown(f"**Caption:** {cap}")
    # 3) Story: a stronger, clearer prompt
    prompt = (
        f"Here’s an image description: “{cap}”.\n\n"
        "Write a playful, 80–100 word story for 3–10 year-old children.\n"
        "- Focus only on the panda and what it’s doing.\n"
        "- Do not introduce any other characters (no kids, no parents).\n"
        "- Be vivid: mention the panda’s feelings or the crunchy meat.\n\n"
        "Story:"
    )
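    # NOTE: the constraints above (the panda, the "crunchy meat") are tailored to
    # one specific demo image; for arbitrary uploads they would need to be derived
    # from the caption rather than hard-coded.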
    with st.spinner("✍️ Writing story..."):
        out = storyteller(
            prompt,
            max_new_tokens=130,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.3,
            no_repeat_ngram_size=3
        )
    # Keep only the text after "Story:" in case the model echoes part of the prompt
    # (text2text models normally return just the generated story)
    raw = out[0]["generated_text"]
    story = raw.split("Story:")[-1].strip()
    st.markdown("**Story:**")
    st.write(story)
    # 4) Text-to-Speech (gTTS)
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tts.write_to_fp(tmp)
        tmp.flush()
    st.audio(tmp.name, format="audio/mp3")
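    # NOTE: with delete=False the generated mp3 stays on disk after each run;
    # a possible cleanup sketch (assumes Streamlit has finished reading the file):
    #   import os
    #   os.unlink(tmp.name)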