# app.py

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile

# ----- Page config -----
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# ----- Load & warm pipelines -----
@st.cache_resource
def load_pipelines():
    # 1) BLIP-base for captions
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=0  # set to -1 if you only have CPU
    )
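    # (Hedged alternative: auto-select the device instead of hard-coding it;
    # torch is already a dependency when running these PyTorch models.)
    #   import torch
    #   device = 0 if torch.cuda.is_available() else -1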
    # 2) DeepSeek-R1-Distill (Qwen-1.5B) for stories
    ds_storyteller = pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        trust_remote_code=True,
        device=0
    )

    # Warm-up both so the first real request is faster
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    ds_storyteller("Warm up", max_new_tokens=1)

    return captioner, ds_storyteller

captioner, ds_storyteller = load_pipelines()

# ----- Main UI -----
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if uploaded:
    # 1) Preprocess & display
    image = Image.open(uploaded).convert("RGB")
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Generate caption
    with st.spinner("🔍 Generating caption..."):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Build prompt
    prompt = (
        f"Here is an image description: “{cap}”.\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and main subject.\n"
        "2) Explains what it’s doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Generate story via DeepSeek
    with st.spinner("✍️ Generating story with DeepSeek..."):
        out = ds_storyteller(
            prompt,
            max_new_tokens=120,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            return_full_text=False  # return only the continuation, not the prompt echoed back
        )
        story = out[0]["generated_text"].strip()
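        # DeepSeek-R1 distills often emit "<think>...</think>" reasoning before
        # the answer; strip it if present (an assumption about this model's
        # output format; a no-op when the tag never appears).
        if "</think>" in story:
            story = story.split("</think>", 1)[1].strip()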

    st.markdown("**Story:**")
    st.write(story)

    # 5) Text-to-Speech via gTTS
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tts.write_to_fp(tmp)
        tmp.flush()
    st.audio(tmp.name, format="audio/mp3")
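
# To run locally (assumed dependency set, not pinned by this file):
#   pip install streamlit pillow transformers torch gtts
#   streamlit run app.py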