File size: 3,460 Bytes
dfb3989 8367fb2 7d2ac1c 6b1de29 8367fb2 dfb3989 8367fb2 7d2ac1c 8367fb2 7d2ac1c 8367fb2 7d2ac1c 8367fb2 7d2ac1c 121e41f 7d2ac1c 8367fb2 dd489ad 258bc7e 7d2ac1c 258bc7e 7d2ac1c 258bc7e 7d2ac1c 8367fb2 258bc7e 121e41f 7d2ac1c 258bc7e 7d2ac1c 258bc7e 7d2ac1c 8367fb2 258bc7e dfb3989 7d2ac1c 0fdc556 121e41f cc355a8 eb25a05 dfb3989 cc355a8 258bc7e 7d2ac1c 258bc7e 7d2ac1c 258bc7e 7d2ac1c b3f64ee 258bc7e 7d2ac1c 258bc7e dfb3989 1c165f8 121e41f 6b1de29 e9fb854 7d2ac1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# app.py
import streamlit as st
from PIL import Image
from io import BytesIO
from huggingface_hub import InferenceApi
from gtts import gTTS
import tempfile
# —––––––– Page config
st.set_page_config(
    page_title="Storyteller for Kids",
    layout="centered",
)
# Heading rendered at the top of the page.
st.title("🖼️ ➡️ 📖 Interactive Storyteller")
# —––––––– Inference clients (cached)
@st.cache_resource
def load_clients():
    """Build the two Hugging Face inference clients used by the app.

    The API token is read from ``st.secrets["HF_TOKEN"]`` and shared by both
    clients. The function is wrapped in ``st.cache_resource`` so the clients
    are meant to be created once and reused rather than rebuilt on each run.

    Returns:
        A ``(caption_client, story_client)`` pair: the first targets the BLIP
        image-captioning model, the second the DeepSeek text-generation model.

    NOTE(review): ``InferenceApi`` appears to be the legacy client in
    ``huggingface_hub`` (superseded by ``InferenceClient``) -- confirm against
    the installed version before upgrading the dependency.
    """
    api_token = st.secrets["HF_TOKEN"]

    # (repo_id, task) for each client, in the order they are returned.
    model_specs = (
        ("Salesforce/blip-image-captioning-base", "image-to-text"),
        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "text-generation"),
    )
    captioner, storyteller = [
        InferenceApi(repo_id=repo, task=task, token=api_token)
        for repo, task in model_specs
    ]
    return captioner, storyteller
caption_client, story_client = load_clients()


def _extract_generated_text(response):
    """Return the stripped ``generated_text`` carried by an inference response.

    Accepts the response shapes this script deals with: a list whose first
    element holds the result, a single dict, or any other value (which is
    stringified). Returns ``""`` when there is nothing usable -- an empty
    list, ``None``, or a dict without a ``generated_text`` value -- so callers
    can use a simple truthiness check.
    """
    if isinstance(response, list):
        # An empty list must not be stringified into the literal "[]".
        response = response[0] if response else None
    if response is None:
        return ""
    if isinstance(response, dict):
        # `or ""` guards against an explicit None value for the key.
        return str(response.get("generated_text") or "").strip()
    return str(response).strip()


# —––––––– Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if not uploaded:
    st.info("Please upload a JPG/PNG image to begin.")
else:
    # 1) Display image
    img = Image.open(uploaded).convert("RGB")
    st.image(img, use_container_width=True)

    # 2) Generate caption: the image is re-encoded as PNG bytes and sent as
    #    the raw request body.
    with st.spinner("🔍 Generating caption..."):
        buf = BytesIO()
        img.save(buf, format="PNG")
        cap_out = caption_client(data=buf.getvalue())

    cap_text = _extract_generated_text(cap_out)
    if not cap_text:
        st.error("😕 I couldn’t generate a caption. Try uploading a different image.")
        st.stop()
    st.markdown(f"**Caption:** {cap_text}")

    # 3) Build prompt for story
    prompt = (
        f"Here’s an image description: “{cap_text}”.\n\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and main subject.\n"
        "2) Explains what it’s doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Generate story
    generation_params = {
        "max_new_tokens": 120,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 3,
        # Ask the API not to echo the prompt back in front of the story.
        "return_full_text": False,
    }
    with st.spinner("✍️ Generating story..."):
        # Passed positionally (inputs, params): the client's keyword for the
        # second argument is `params`, so `parameters=` is rejected.
        story_out = story_client(prompt, generation_params)

    story = _extract_generated_text(story_out)
    # Fallback in case the backend ignores `return_full_text` and still
    # prepends the prompt to the generated text.
    if story.startswith(prompt):
        story = story[len(prompt):].strip()
    if not story:
        st.error("😕 I couldn’t generate a story. Please try again!")
        st.stop()
    st.markdown("**Story:**")
    st.write(story)

    # 5) Text-to-Speech via gTTS. The MP3 is kept in memory so no temporary
    #    file or open handle is left behind on each rerun.
    with st.spinner("🔊 Converting to speech..."):
        audio_buf = BytesIO()
        gTTS(text=story, lang="en").write_to_fp(audio_buf)
    st.audio(audio_buf.getvalue(), format="audio/mp3")
|