CR7CAD committed (verified)
Commit: a79c9ac
Parent: 5f21a2d

Update app.py

Files changed (1):
  1. app.py +61 -91
app.py CHANGED
@@ -1,8 +1,6 @@
-# import part
+# import part - only using the two requested imports
 import streamlit as st
 from transformers import pipeline
-import os
-import tempfile
 
 # function part
 # img2text
@@ -11,18 +9,18 @@ def img2text(image_path):
     text = image_to_text(image_path)[0]["generated_text"]
     return text
 
-# text2story
+# text2story - IMPROVED to end naturally
 def text2story(text):
     # Using a smaller text generation model
     generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
 
     # Create a prompt for the story generation
-    prompt = f"Write a fun children's story based on this: {text}. Once upon a time, "
+    prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "
 
     # Generate the story
     story_result = generator(
         prompt,
-        max_length=150,
+        max_length=250,  # Increased to allow for a complete story
         num_return_sequences=1,
         temperature=0.7,
         top_k=50,
@@ -34,25 +32,34 @@ def text2story(text):
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
 
-    # Make sure the story is at least 100 words
+    # Find a natural ending point (end of sentence) before 100 words
     words = story_text.split()
     if len(words) > 100:
-        # Simply truncate to 100 words
-        story_text = " ".join(words[:100])
+        # Join the first 100 words
+        shortened_text = " ".join(words[:100])
+
+        # Find the last complete sentence
+        last_period = shortened_text.rfind('.')
+        last_question = shortened_text.rfind('?')
+        last_exclamation = shortened_text.rfind('!')
+
+        # Find the last sentence-ending punctuation
+        last_end = max(last_period, last_question, last_exclamation)
+
+        if last_end > 0:
+            # Truncate at the end of the last complete sentence
+            story_text = shortened_text[:last_end + 1]
+        else:
+            # If no sentence ending found, just use the shortened text
+            story_text = shortened_text
 
     return story_text
 
-# text2audio - REVISED to correctly handle the audio output
+# text2audio - Using HelpingAI-TTS-v1 model
 def text2audio(story_text):
     try:
-        # Use a different TTS model that works reliably with pipeline
-        synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-
-        # Additional input required for this model
-        speaker_embeddings = pipeline(
-            "audio-classification",
-            model="microsoft/speecht5_speaker_embeddings"
-        )("some_audio_file.mp3")["logits"]
+        # Use the HelpingAI TTS model as requested
+        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
 
         # Limit text length to avoid timeouts
         max_chars = 500
@@ -63,71 +70,18 @@ def text2audio(story_text):
         else:
            story_text = story_text[:max_chars]
 
-        # Generate speech with correct parameters
-        speech = synthesizer(
-            text=story_text,
-            forward_params={"speaker_embeddings": speaker_embeddings}
-        )
+        # Generate speech
+        speech = synthesizer(story_text)
 
-        # Create a temporary WAV file
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
-        temp_filename = temp_file.name
-        temp_file.close()
-
-        # Display the structure of the speech output for debugging
-        st.write(f"Speech output keys: {speech.keys()}")
-
-        # Save the audio data to the temporary file
-        # Different models have different output formats, we'll try common keys
-        if 'audio' in speech:
-            # Convert numpy array to WAV file
-            try:
-                import scipy.io.wavfile as wavfile
-                wavfile.write(temp_filename, speech['sampling_rate'], speech['audio'])
-            except ImportError:
-                # If scipy is not available, try raw writing
-                with open(temp_filename, 'wb') as f:
-                    # Convert numpy array to bytes in a simple way
-                    if isinstance(speech['audio'], np.ndarray):
-                        audio_bytes = speech['audio'].tobytes()
-                        f.write(audio_bytes)
-                    else:
-                        f.write(speech['audio'])
-        elif 'numpy_array' in speech:
-            with open(temp_filename, 'wb') as f:
-                f.write(speech['numpy_array'].tobytes())
-        else:
-            # Fallback: try to write whatever is available
-            with open(temp_filename, 'wb') as f:
-                # Just write the first value that seems like it could be audio data
-                for key, value in speech.items():
-                    if isinstance(value, (bytes, bytearray)) or (
-                            isinstance(value, np.ndarray) and value.size > 1000):
-                        if isinstance(value, np.ndarray):
-                            f.write(value.tobytes())
-                        else:
-                            f.write(value)
-                        break
+        # Get output information
+        st.write(f"Speech output keys: {list(speech.keys())}")
 
-        return temp_filename
+        return speech
 
     except Exception as e:
         st.error(f"Error generating audio: {str(e)}")
-        # Print all available keys for debugging
         return None
 
-# Function to save temporary image file
-def save_uploaded_image(uploaded_file):
-    if not os.path.exists("temp"):
-        os.makedirs("temp")
-
-    image_path = os.path.join("temp", uploaded_file.name)
-
-    with open(image_path, "wb") as f:
-        f.write(uploaded_file.getvalue())
-
-    return image_path
-
 # main part
 st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
 st.header("Turn Your Image to Audio Story")
@@ -137,12 +91,12 @@ if uploaded_file is not None:
     # Display the uploaded image
     st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
 
-    # Save the image temporarily
-    image_path = save_uploaded_image(uploaded_file)
+    # Create a temporary file in memory from the uploaded file
+    image_bytes = uploaded_file.getvalue()
 
     # Stage 1: Image to Text
     st.text('Processing img2text...')
-    caption = img2text(image_path)
+    caption = img2text(image_bytes)  # Pass bytes directly to pipeline
     st.write(caption)
 
     # Stage 2: Text to Story
@@ -152,19 +106,35 @@ if uploaded_file is not None:
 
     # Stage 3: Story to Audio data
     st.text('Generating audio data...')
-    audio_file = text2audio(story)
+    speech_output = text2audio(story)
 
     # Play button
     if st.button("Play Audio"):
-        if audio_file and os.path.exists(audio_file):
-            # Play the audio file
-            st.audio(audio_file)
+        if speech_output is not None:
+            # Try to play the audio directly
+            try:
+                if 'audio' in speech_output and 'sampling_rate' in speech_output:
+                    st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
+                elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
+                    st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
+                elif 'waveform' in speech_output and 'sample_rate' in speech_output:
+                    st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
+                else:
+                    # Try the first array-like value as audio data
+                    for key, value in speech_output.items():
+                        if hasattr(value, '__len__') and len(value) > 1000:
+                            if 'rate' in speech_output:
+                                st.audio(value, sample_rate=speech_output['rate'])
+                            elif 'sample_rate' in speech_output:
+                                st.audio(value, sample_rate=speech_output['sample_rate'])
+                            elif 'sampling_rate' in speech_output:
+                                st.audio(value, sample_rate=speech_output['sampling_rate'])
+                            else:
+                                st.audio(value, sample_rate=24000)  # Default sample rate
+                            break
+                    else:
+                        st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
+            except Exception as e:
+                st.error(f"Error playing audio: {str(e)}")
         else:
-            st.error("Audio generation failed. Please try again.")
-
-    # Clean up the temporary files
-    try:
-        os.remove(image_path)
-        # Don't delete audio file immediately as it might still be playing
-    except:
-        pass
+            st.error("Audio generation failed. Please try again.")
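
Note on the truncation change in text2story: the sentence-boundary logic can be exercised on its own. A minimal sketch; the helper name truncate_at_sentence is mine, since app.py inlines this logic inside text2story:

# Standalone sketch of the sentence-boundary truncation added in this
# commit. The function name truncate_at_sentence is hypothetical; app.py
# inlines this logic inside text2story.
def truncate_at_sentence(story_text, max_words=100):
    words = story_text.split()
    if len(words) <= max_words:
        return story_text
    shortened_text = " ".join(words[:max_words])
    # Cut at the last '.', '?' or '!' so the story ends on a complete sentence
    last_end = max(shortened_text.rfind('.'),
                   shortened_text.rfind('?'),
                   shortened_text.rfind('!'))
    return shortened_text[:last_end + 1] if last_end > 0 else shortened_text

# Quick check: a 150-word input is cut back to the last full sentence
print(truncate_at_sentence("It was fun. " * 50))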
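Note on the playback branches: transformers text-to-speech pipelines commonly return a dict with 'audio' (a NumPy array) and 'sampling_rate', so the first branch is the one expected to fire. A minimal sketch under that assumption; whether HelpingAI/HelpingAI-TTS-v1 actually loads under the text-to-speech pipeline task is not verified here:

# Sketch of the expected happy path, assuming the model returns the
# common {'audio': np.ndarray, 'sampling_rate': int} pipeline output.
# Model availability/compatibility is an assumption, not verified.
import streamlit as st
from transformers import pipeline

synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
speech = synthesizer("Once upon a time, a parrot learned to paint.")

if "audio" in speech and "sampling_rate" in speech:
    st.audio(speech["audio"], sample_rate=speech["sampling_rate"])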
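Note on img2text(image_bytes): passing raw bytes relies on the pipeline accepting bytes directly, which varies across transformers versions; decoding the upload to a PIL image first is the safer route. A sketch, where the captioning model name is a placeholder since the actual model is defined outside this diff:

# Fallback sketch if img2text(image_bytes) raises on raw bytes: decode
# the upload to a PIL image, which image-to-text pipelines accept.
# The model name below is a placeholder; the real one is set earlier
# in app.py, outside this diff.
import io
from PIL import Image
from transformers import pipeline

def img2text_from_bytes(image_bytes):
    image = Image.open(io.BytesIO(image_bytes))
    image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    return image_to_text(image)[0]["generated_text"]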