# app.py
import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile
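
# Assumed dependencies (the original does not pin them): streamlit, pillow,
# transformers (with torch), and gtts, e.g.:
#   pip install streamlit pillow transformers torch gtts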
# ---------- Page config
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# ---------- Load & warm pipelines
@st.cache_resource
def load_pipelines():
    # 1) BLIP-base for captions
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=0,  # set to -1 if you only have CPU
    )
    # 2) DeepSeek-R1-Distill (Qwen-1.5B) for stories
    ds_storyteller = pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        trust_remote_code=True,
        device=0,
    )
    # Warm up both so the first real request is faster
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    ds_storyteller("Warm up", max_new_tokens=1)
    return captioner, ds_storyteller

captioner, ds_storyteller = load_pipelines()
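
# Note: st.cache_resource keeps both pipelines alive across Streamlit reruns,
# so the models are loaded and warmed only once per server process.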

# ---------- Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if uploaded:
    # 1) Preprocess & display (384x384 is a typical input size for BLIP-base)
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Generate caption
    with st.spinner("🔍 Generating caption..."):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Build prompt
    prompt = (
        f"Here is an image description: “{cap}”.\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and main subject.\n"
        "2) Explains what it’s doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Generate story via DeepSeek
    with st.spinner("✍️ Generating story with DeepSeek..."):
        out = ds_storyteller(
            prompt,
            max_new_tokens=120,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            return_full_text=False,  # return only the continuation, not the prompt
        )
        story = out[0]["generated_text"].strip()
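        # DeepSeek-R1 distills often emit a <think>...</think> reasoning block
        # before the answer (an assumption about this checkpoint's output
        # format); a minimal strip, if that shows up here:
        #   import re
        #   story = re.sub(r"<think>.*?</think>", "", story, flags=re.DOTALL).strip()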
st.markdown("**Story:**")
st.write(story)

    # 5) Text-to-Speech via gTTS
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tts.write_to_fp(tmp)
        tmp.flush()
        tmp.close()  # close before playback so the file is readable on all platforms
    st.audio(tmp.name, format="audio/mp3")
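    # Optional cleanup sketch (assumes st.audio has already read the file):
    #   import os
    #   os.unlink(tmp.name)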