CR7CAD committed on
Commit ad4186a · verified · 1 Parent(s): ab8ead3

Update app.py

Files changed (1)
  1. app.py +41 -121
app.py CHANGED
@@ -1,149 +1,69 @@
-# import part
+# Only the two imports you requested
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
 
-# Set global caching options for Transformers
-from transformers import set_caching_enabled
-set_caching_enabled(True)
-
-# function part with caching for better performance
-@st.cache_resource
-def load_image_captioning_model():
-    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
-
-@st.cache_resource
-def load_text_generator():
-    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-@st.cache_resource
-def load_tts_model():
-    return pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
-
-# img2text - Using the original model with more constraints
+# Simple image-to-text function
 def img2text(image):
-    # Load the model (cached)
-    image_to_text = load_image_captioning_model()
-
-    # Strongly limit output length for speed
-    text = image_to_text(image, max_new_tokens=15)[0]["generated_text"]
+    image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
+    text = image_to_text(image)[0]["generated_text"]
     return text
 
-# text2story - Much more constrained for speed
+# Simple text-to-story function
 def text2story(text):
-    # Load the model (cached)
-    generator = load_text_generator()
+    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    prompt = f"Write a short children's story based on this: {text}. Once upon a time, "
 
-    # Very brief prompt to minimize work
-    prompt = f"Short story about {text}: Once upon a time, "
-
-    # Very constrained parameters for maximum speed
     story_result = generator(
         prompt,
-        max_new_tokens=60, # Much shorter output
+        max_length=150,
         num_return_sequences=1,
         temperature=0.7,
-        top_k=10, # Lower value = faster
-        top_p=0.9, # Lower value = faster
        do_sample=True
     )
 
-    # Extract and clean text
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
-
-    # Find a natural ending point
-    last_period = story_text.rfind('.')
-    if last_period > 30: # Ensure we have at least some content
-        story_text = story_text[:last_period + 1]
-
     return story_text
 
-# text2audio - Minimal text for faster processing
+# Simple text-to-audio function
 def text2audio(story_text):
-    try:
-        # Load the model (cached)
-        synthesizer = load_tts_model()
-
-        # Aggressively limit text length to speed up TTS
-        max_chars = 200 # Much shorter than before
-        if len(story_text) > max_chars:
-            last_period = story_text[:max_chars].rfind('.')
-            if last_period > 0:
-                story_text = story_text[:last_period + 1]
-            else:
-                story_text = story_text[:max_chars]
-
-        # Generate speech
-        speech = synthesizer(story_text)
-        return speech
-
-    except Exception as e:
-        st.error(f"Error generating audio: {str(e)}")
-        return None
+    synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
+    speech = synthesizer(story_text)
+    return speech
 
-# Streamlined main UI
-st.set_page_config(page_title="Image to Story", page_icon="📚")
-st.header("Image to Audio Story")
+# Basic Streamlit interface
+st.title("Image to Audio Story")
+uploaded_file = st.file_uploader("Upload an image")
 
-# Add info about processing time
-st.info("Note: Processing may take some time as the models are loading. Please be patient.")
-
-# Cache the file uploader state
-if "uploaded_file" not in st.session_state:
-    st.session_state["uploaded_file"] = None
-
-uploaded_file = st.file_uploader("Select an Image...", key="file_uploader")
-
-# Process the image if uploaded
 if uploaded_file is not None:
-    st.session_state["uploaded_file"] = uploaded_file
+    # Display image
+    st.image(uploaded_file, caption="Uploaded Image")
 
-    # Display the uploaded image
-    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
-
-    # Convert to PIL image
+    # Convert to PIL Image
     image = Image.open(uploaded_file)
 
-    # Optional processing toggle to let user decide
-    if st.button("Generate Story and Audio"):
-        col1, col2 = st.columns(2)
-
-        # Stage 1: Image to Text with minimal output
-        with col1:
-            with st.spinner('Captioning image...'):
-                caption = img2text(image)
-                st.write(f"**Caption:** {caption}")
-
-        # Stage 2: Text to Story with minimal length
-        with col2:
-            with st.spinner('Creating story...'):
-                story = text2story(caption)
-                st.write(f"**Story:** {story}")
-
-        # Stage 3: Audio with minimal text
-        with st.spinner('Generating audio...'):
-            speech_output = text2audio(story)
-
-        # Display audio immediately
-        if speech_output is not None:
-            try:
-                if 'audio' in speech_output and 'sampling_rate' in speech_output:
-                    st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
-                elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
-                    st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
-                elif 'waveform' in speech_output and 'sample_rate' in speech_output:
-                    st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
-                else:
-                    # Try any array-like data
-                    for key, value in speech_output.items():
-                        if hasattr(value, '__len__') and len(value) > 1000:
-                            sample_rate = speech_output.get('sampling_rate', speech_output.get('sample_rate', 24000))
-                            st.audio(value, sample_rate=sample_rate)
-                            break
-                    else:
-                        st.error("Could not find audio data in the output")
-            except Exception as e:
-                st.error(f"Error playing audio: {str(e)}")
+    # Image to Text
+    st.write("Generating caption...")
+    caption = img2text(image)
+    st.write(f"Caption: {caption}")
+
+    # Text to Story
+    st.write("Creating story...")
+    story = text2story(caption)
+    st.write(f"Story: {story}")
+
+    # Text to Audio
+    st.write("Generating audio...")
+    speech_output = text2audio(story)
+
+    # Play audio
+    try:
+        if 'audio' in speech_output and 'sampling_rate' in speech_output:
+            st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
+        elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
+            st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
         else:
-            st.error("Audio generation failed")
+            st.write("Audio generated but could not be played.")
+    except Exception as e:
+        st.error(f"Error playing audio: {e}")