# app.py
import streamlit as st
from PIL import Image
from io import BytesIO
from huggingface_hub import InferenceApi
from gtts import gTTS
import tempfile

# ---------- Page config
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# ---------- Inference clients (cached)
@st.cache_resource
def load_clients():
    # read your HF token from Space secrets
    hf_token = st.secrets["HF_TOKEN"]
    # caption client: BLIP-base via the HF image-to-text API
    caption_client = InferenceApi(
        repo_id="Salesforce/blip-image-captioning-base",
        task="image-to-text",
        token=hf_token,
    )
    # story client: DeepSeek-R1-Distill via the HF text-generation API
    story_client = InferenceApi(
        repo_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        task="text-generation",
        token=hf_token,
    )
    return caption_client, story_client
caption_client, story_client = load_clients()
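
# Note: InferenceApi is huggingface_hub's legacy client and is deprecated in
# recent releases. If your installed version no longer ships it, the newer
# InferenceClient exposes equivalent calls; a rough, untested sketch:
#
#     from huggingface_hub import InferenceClient
#     client = InferenceClient(token=st.secrets["HF_TOKEN"])
#     cap = client.image_to_text(image_bytes, model="Salesforce/blip-image-captioning-base")
#     out = client.text_generation(prompt, model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
#                                  max_new_tokens=120, temperature=0.7)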

# ---------- Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if not uploaded:
    st.info("Please upload an image (JPG/PNG) to begin.")
else:
    # 1) Display the image
    img = Image.open(uploaded).convert("RGB")
    st.image(img, use_container_width=True)

    # 2) Caption via the HF Inference API
    with st.spinner("🔍 Generating caption..."):
        buf = BytesIO()
        img.save(buf, format="PNG")
        caption_output = caption_client(data=buf.getvalue())
        # handle the API's return formats: the image-to-text endpoint usually
        # returns a list of dicts, but a single dict or raw value can also occur
        if isinstance(caption_output, list) and caption_output:
            cap_text = caption_output[0].get("generated_text", "").strip()
        elif isinstance(caption_output, dict):
            cap_text = caption_output.get("generated_text", "").strip()
        else:
            cap_text = str(caption_output).strip()
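
    # Defensive guard (an addition, not part of the API contract): stop early if
    # captioning returned nothing usable, rather than building a story prompt
    # from an empty description.
    if not cap_text:
        st.error("Captioning failed; please try another image.")
        st.stop()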
st.markdown(f"**Caption:** {cap_text}")

    # 3) Build prompt
    prompt = (
        f"Here’s an image description: “{cap_text}”.\n\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and main subject.\n"
        "2) Explains what it’s doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Story via the HF Inference API
    with st.spinner("✍️ Generating story..."):
        story_output = story_client(
            inputs=prompt,
            params={
                "max_new_tokens": 120,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 50,
                "repetition_penalty": 1.2,
                "no_repeat_ngram_size": 3,
            },
        )
        # the API returns a list of generations or a single dict
        if isinstance(story_output, list):
            story = story_output[0].get("generated_text", "").strip()
        else:
            story = story_output.get("generated_text", "").strip()
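
    # Post-processing (assumptions about this endpoint's behavior): transformers-
    # backed text-generation endpoints typically echo the prompt inside
    # generated_text, and DeepSeek-R1 models may emit a <think>...</think>
    # reasoning block before the story. Strip both defensively.
    import re  # in practice, move this up to the top-level imports
    if story.startswith(prompt):
        story = story[len(prompt):].strip()
    story = re.sub(r"<think>.*?</think>", "", story, flags=re.DOTALL).strip()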
st.markdown("**Story:**")
st.write(story)

    # 5) Text-to-Speech via gTTS
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tts.write_to_fp(tmp)
        tmp.close()  # close (not just flush) so the MP3 is fully on disk before playback
        st.audio(tmp.name, format="audio/mp3")
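
# Assumed requirements.txt for the Space (package names only; pin versions as needed):
#     streamlit
#     Pillow
#     huggingface_hub
#     gTTS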