1
File size: 2,695 Bytes
dfb3989
 
8367fb2
 
 
6b1de29
8367fb2
 
dfb3989
 
 
8367fb2
e9fb854
8367fb2
 
e9fb854
8367fb2
504dc12
33fead7
eb25a05
8367fb2
e9fb854
8367fb2
eb25a05
e9fb854
c916589
8367fb2
c916589
e9fb854
 
c916589
e9fb854
c916589
8367fb2
 
dfb3989
8367fb2
dd489ad
eb25a05
dfb3989
e9fb854
 
 
c916589
8367fb2
eb25a05
c916589
 
dfb3989
8367fb2
e9fb854
dfb3989
e9fb854
 
 
 
 
eb25a05
dfb3989
e9fb854
dd4f7ba
b3f64ee
e9fb854
eb25a05
 
 
e9fb854
 
eb25a05
b3f64ee
e9fb854
 
 
eb25a05
dfb3989
 
1c165f8
e9fb854
dfb3989
6b1de29
 
 
 
e9fb854
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# app.py

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile

# —––––––– Page config: browser-tab title + centered layout, then the app header
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# —––––––– Load & warm models (cached so Streamlit reruns reuse them)
@st.cache_resource
def load_pipelines():
    """Build, warm up, and return the (captioner, storyteller) pipelines.

    Both pipelines are placed on device 0; a throwaway call to each is made
    so the first real user request doesn't pay the one-time startup cost.
    """
    # Image -> caption: BLIP-base
    caption_pipe = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=0,
    )
    # Instruction -> story: Flan-T5-Large
    story_pipe = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=0,
    )

    # Warm-up: one dummy inference per pipeline.
    warmup_img = Image.new("RGB", (384, 384), color=(128, 128, 128))
    caption_pipe(warmup_img)
    story_pipe("Hello", max_new_tokens=1)

    return caption_pipe, story_pipe

captioner, storyteller = load_pipelines()

# —––––––– Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg","jpeg","png"])
if uploaded:
    # 1) Preprocess + display: force RGB and the 384x384 size BLIP was warmed with
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Caption the image
    with st.spinner("🔍 Generating caption..."):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Story prompt — built around the caption so it works for ANY upload.
    #    (Previously hard-coded "the panda" / "the crunchy meat", which
    #    contradicted the caption for every other image.)
    prompt = (
        f"Here’s an image description: “{cap}”.\n\n"
        "Write a playful, 80–100 word story for 3–10 year-old children.\n"
        "- Focus only on the subject(s) in the description above.\n"
        "- Do not introduce any characters that are not in the description.\n"
        "- Be vivid: mention the subject's feelings, sights, or sounds.\n\n"
        "Story:"
    )
    with st.spinner("✍️ Writing story..."):
        out = storyteller(
            prompt,
            max_new_tokens=130,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.3,
            no_repeat_ngram_size=3
        )
        # Keep only the text after the final "Story:" marker; some seq2seq
        # outputs echo part of the prompt.
        story = out[0]["generated_text"].split("Story:")[-1].strip()

    st.markdown("**Story:**")
    st.write(story)

    # 4) Text-to-Speech (gTTS). Close the temp file before playback so the
    #    MP3 bytes are fully flushed and the handle isn't leaked (on Windows
    #    an open NamedTemporaryFile can't be reopened by the player at all).
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tts.write_to_fp(tmp)
            audio_path = tmp.name
    st.audio(audio_path, format="audio/mp3")