1
File size: 2,305 Bytes
dfb3989
 
8367fb2
 
 
6b1de29
8367fb2
 
dfb3989
 
 
8367fb2
c916589
8367fb2
 
6b1de29
8367fb2
504dc12
33fead7
6b1de29
8367fb2
6b1de29
8367fb2
b3f64ee
c916589
 
8367fb2
c916589
6b1de29
33fead7
c916589
 
 
8367fb2
 
dfb3989
8367fb2
6b1de29
dfb3989
 
6b1de29
dfb3989
c916589
 
8367fb2
6b1de29
c916589
 
dfb3989
8367fb2
6b1de29
dfb3989
6b1de29
 
dfb3989
c916589
dd4f7ba
b3f64ee
c916589
33fead7
b3f64ee
 
dfb3989
 
1c165f8
6b1de29
dfb3989
6b1de29
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# app.py

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile

# —––––––– Page config
# Page-level settings must be issued before any other Streamlit element is
# drawn, so the configuration call stays ahead of the title.
st.set_page_config(
    layout="centered",
    page_title="Storyteller for Kids",
)
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# —––––––– Model loading + warm-up
@st.cache_resource
def load_pipelines():
    """Build and warm up the captioning and story-generation pipelines.

    The function is wrapped in ``st.cache_resource`` so the models are
    created once and reused across Streamlit reruns.

    Returns:
        A ``(captioner, storyteller)`` tuple: an ``image-to-text`` pipeline
        (BLIP base) and a ``text-generation`` pipeline (GPT-Neo 125M).
    """
    # Pick the device at runtime instead of hard-coding GPU 0, which made
    # the app fail at startup on CPU-only hosts. -1 means "CPU" for pipelines.
    try:
        import torch
    except ImportError:
        device = -1
    else:
        device = 0 if torch.cuda.is_available() else -1

    # 1) Original BLIP-base captioner
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device,
    )
    # 2) Lightweight GPT-Neo for stories
    storyteller = pipeline(
        "text-generation",
        model="EleutherAI/gpt-neo-125M",
        device=device,
    )

    # Warm-up so first real request is fast: run one throwaway inference
    # through each pipeline with a blank grey image / a one-token generation.
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    storyteller("Hello", max_new_tokens=1)

    return captioner, storyteller

captioner, storyteller = load_pipelines()

# —––––––– Image upload & processing
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if uploaded:
    # 1) Load + downsize for faster vision encoding
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Caption step
    with st.spinner("🔍 Generating caption..."):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Story generation (greedy for speed)
    prompt = (
        "Write an 80–100 word playful story for 3–10 year-olds "
        f"based on this description:\n\n“{cap}”\n\nStory:"
    )
    with st.spinner("✍️ Generating story..."):
        out = storyteller(
            prompt,
            max_new_tokens=120,
            do_sample=False,
            # By default the pipeline returns prompt + continuation; without
            # this flag the instruction text is shown and read aloud as part
            # of the story.
            return_full_text=False,
        )
        story = out[0]["generated_text"].strip()
    st.markdown("**Story:**")

    if not story:
        # The model may emit nothing usable; gTTS rejects empty text, so
        # report it instead of crashing in the speech step.
        st.warning("No story could be generated for this image. Please try another one.")
    else:
        st.write(story)

        # 4) Text-to-Speech via gTTS (network-based)
        with st.spinner("🔊 Converting to speech..."):
            tts = gTTS(text=story, lang="en")
            # Use a self-cleaning temporary file and hand the bytes to the
            # player, so no file handle or .mp3 is left behind per upload.
            with tempfile.TemporaryFile(suffix=".mp3") as tmp:
                tts.write_to_fp(tmp)
                tmp.seek(0)
                audio_bytes = tmp.read()
        st.audio(audio_bytes, format="audio/mp3")