1
File size: 2,453 Bytes
dfb3989
 
8367fb2
 
 
c916589
8367fb2
 
dfb3989
 
 
8367fb2
c916589
8367fb2
 
33fead7
8367fb2
504dc12
33fead7
 
8367fb2
33fead7
8367fb2
b3f64ee
c916589
 
8367fb2
c916589
33fead7
 
c916589
 
 
8367fb2
 
33fead7
c916589
 
 
33fead7
c916589
 
dfb3989
c916589
8367fb2
33fead7
dfb3989
 
33fead7
dfb3989
c916589
 
8367fb2
c916589
 
 
dfb3989
8367fb2
33fead7
dfb3989
33fead7
dfb3989
c916589
dd4f7ba
b3f64ee
c916589
33fead7
b3f64ee
 
dfb3989
 
1c165f8
33fead7
dfb3989
c916589
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# app.py

import os
import tempfile

import pyttsx3
import streamlit as st
from PIL import Image
from transformers import pipeline

# —––––––– Page config
# NOTE: set_page_config must be the first Streamlit command executed in the
# script, before any other st.* call — keep it at the top.
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# —––––––– Model loading + warm-up
@st.cache_resource
def load_pipelines():
    """Load and warm up the captioning and story-generation pipelines.

    Cached by ``st.cache_resource`` so the models are loaded once per
    server process rather than on every script rerun.

    Returns:
        tuple: ``(captioner, storyteller)`` Hugging Face pipelines —
        BLIP-base for image-to-text and GPT-Neo-125M for text generation.
    """
    import torch

    # Pick the GPU when one is present, otherwise fall back to CPU (-1).
    # The previous hard-coded device=0 crashed on CPU-only machines.
    device = 0 if torch.cuda.is_available() else -1

    # 1) BLIP-base for captions
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device,
    )
    # 2) Lightweight story model
    storyteller = pipeline(
        "text-generation",
        model="EleutherAI/gpt-neo-125M",
        device=device,
    )

    # Warm up with a dummy run so the first real call is fast
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    storyteller("Hello", max_new_tokens=1)

    return captioner, storyteller

# —––––––– Initialize local TTS (offline)
@st.cache_resource
def init_tts_engine():
    """Create and configure the offline pyttsx3 speech engine.

    Cached so a single engine instance is shared across reruns.
    """
    speech_engine = pyttsx3.init()
    # Slow the default speaking speed down to 150 words per minute.
    speech_engine.setProperty("rate", 150)
    return speech_engine

captioner, storyteller = load_pipelines()
tts_engine = init_tts_engine()

# —––––––– Main UI
# Flow: upload image → caption it → generate a short story → speak it aloud.
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if uploaded:
    # 1) Resize image to reduce BLIP load (BLIP-base expects ~384x384 input)
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Caption
    with st.spinner("🔍 Generating caption..."):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Story (greedy decoding = fastest; do_sample=False keeps it deterministic)
    prompt = (
        f"Tell an 80–100 word fun story for 3–10 year-olds based on:\n\n“{cap}”\n\nStory:"
    )
    with st.spinner("✍️ Generating story..."):
        out = storyteller(
            prompt,
            max_new_tokens=120,
            do_sample=False
        )
        story = out[0]["generated_text"].strip()
    st.markdown("**Story:**")
    st.write(story)

    # 4) TTS (local, no network). Write to a temp .wav, hand the BYTES to
    # st.audio, and delete the file — the original leaked one open temp file
    # per rerun and kept its handle open while pyttsx3 wrote to the same path
    # (which fails on Windows, where the path can't be opened twice).
    with st.spinner("🔊 Converting to speech..."):
        # Close the handle immediately; we only need a unique path on disk.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
        try:
            tts_engine.save_to_file(story, wav_path)
            tts_engine.runAndWait()  # blocks until the wav is fully written
            with open(wav_path, "rb") as wav_file:
                audio_bytes = wav_file.read()
        finally:
            os.unlink(wav_path)  # don't accumulate temp wavs across reruns
    st.audio(audio_bytes, format="audio/wav")