# app.py
import streamlit as st
from PIL import Image
from transformers import pipeline
import pyttsx3
import tempfile
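
# NOTE (assumption about the deployment host): pyttsx3 speaks through the OS
# TTS engine (SAPI5 on Windows, NSSpeechSynthesizer on macOS, eSpeak on Linux),
# so no network is needed, but one of those backends must be installed.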

# --------- Page config
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# --------- Model loading + warm-up
@st.cache_resource
def load_pipelines():
    # 1) Keep the original BLIP-base for captions
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=0,  # if you have a GPU; use -1 for CPU-only
    )
    # 2) Switch to a lightweight story model
    storyteller = pipeline(
        "text-generation",
        model="EleutherAI/gpt-neo-125M",
        device=0,
    )
    # Warm up with a dummy run so the first real call is fast
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    storyteller("Hello", max_new_tokens=1)
    return captioner, storyteller
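
# NOTE: device=0 assumes a CUDA GPU is visible to PyTorch. On a CPU-only box,
# pass device=-1 to both pipeline() calls, or pick it dynamically with
# `0 if torch.cuda.is_available() else -1` (which needs `import torch` above).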

# --------- Initialize local TTS (offline)
@st.cache_resource
def init_tts_engine():
    engine = pyttsx3.init()
    engine.setProperty("rate", 150)  # words per minute
    return engine

captioner, storyteller = load_pipelines()
tts_engine = init_tts_engine()
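
# @st.cache_resource memoizes these objects, so the models and the TTS engine
# are created (and warmed up) only once per server process, not on every rerun.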

# --------- Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])

if uploaded:
    # 1) Resize the image to reduce BLIP load
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Caption
    with st.spinner("🔍 Generating caption..."):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Story (greedy decoding = fastest)
    prompt = (
        f"Tell an 80–100 word fun story for 3–10 year-olds based on:\n\n“{cap}”\n\nStory:"
    )
    with st.spinner("✍️ Generating story..."):
        out = storyteller(
            prompt,
            max_new_tokens=120,
            do_sample=False,
            return_full_text=False,  # keep only the newly generated text, not the prompt
        )
        story = out[0]["generated_text"].strip()
    st.markdown("**Story:**")
    st.write(story)

    # 4) TTS (local, no network)
    with st.spinner("🔊 Converting to speech..."):
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.close()  # release the handle so pyttsx3 can write to the path
        tts_engine.save_to_file(story, tmp.name)
        tts_engine.runAndWait()
    st.audio(tmp.name)
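
# To run locally (assumes streamlit, transformers plus a backend such as torch,
# pillow, and pyttsx3 are installed):
#   streamlit run app.py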