CR7CAD committed on
Commit
a4fc174
·
verified ·
1 Parent(s): 9e7cf7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -43
app.py CHANGED
@@ -1,29 +1,26 @@
1
  # import part
2
  import streamlit as st
3
  from transformers import pipeline
4
- from PIL import Image
5
 
6
  # function part
7
- # img2text - Using the original model
8
- def img2text(image):
9
- # Use the specified model but with optimized parameters
10
  image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
11
- # Limiting the output length for speed
12
- text = image_to_text(image, max_new_tokens=30)[0]["generated_text"]
13
  return text
14
 
15
- # text2story - Using the original model but with optimized parameters
16
  def text2story(text):
17
- # Using the specified TinyLlama model
18
  generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
19
 
20
  # Create a prompt for the story generation
21
- prompt = f"Write a brief children's story based on this: {text}. Once upon a time, "
22
 
23
- # Generate with more constrained parameters for speed
24
  story_result = generator(
25
  prompt,
26
- max_new_tokens=150, # Use max_new_tokens instead of max_length for efficiency
27
  num_return_sequences=1,
28
  temperature=0.7,
29
  top_k=50,
@@ -58,7 +55,7 @@ def text2story(text):
58
 
59
  return story_text
60
 
61
- # text2audio - Using HelpingAI-TTS-v1 model
62
  def text2audio(story_text):
63
  try:
64
  # Use the HelpingAI TTS model as requested
@@ -74,14 +71,32 @@ def text2audio(story_text):
74
  story_text = story_text[:max_chars]
75
 
76
  # Generate speech
 
77
  speech = synthesizer(story_text)
 
78
 
 
 
79
  return speech
80
 
81
  except Exception as e:
82
  st.error(f"Error generating audio: {str(e)}")
 
 
83
  return None
84
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  # main part
86
  st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
87
  st.header("Turn Your Image to Audio Story")
@@ -91,28 +106,22 @@ if uploaded_file is not None:
91
  # Display the uploaded image
92
  st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
93
 
94
- # Convert the file to a PIL Image
95
- image = Image.open(uploaded_file)
96
-
97
- # Progress indicator
98
- progress_bar = st.progress(0)
99
 
100
  # Stage 1: Image to Text
101
- with st.spinner('Processing image caption...'):
102
- caption = img2text(image)
103
- progress_bar.progress(33)
104
- st.write(f"**Image caption:** {caption}")
105
 
106
  # Stage 2: Text to Story
107
- with st.spinner('Creating story...'):
108
- story = text2story(caption)
109
- progress_bar.progress(66)
110
- st.write(f"**Story:** {story}")
111
 
112
  # Stage 3: Story to Audio data
113
- with st.spinner('Generating audio...'):
114
- speech_output = text2audio(story)
115
- progress_bar.progress(100)
116
 
117
  # Play button
118
  if st.button("Play Audio"):
@@ -126,21 +135,14 @@ if uploaded_file is not None:
126
  elif 'waveform' in speech_output and 'sample_rate' in speech_output:
127
  st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
128
  else:
129
- # Try the first array-like value as audio data
130
- for key, value in speech_output.items():
131
- if hasattr(value, '__len__') and len(value) > 1000:
132
- if 'rate' in speech_output:
133
- st.audio(value, sample_rate=speech_output['rate'])
134
- elif 'sample_rate' in speech_output:
135
- st.audio(value, sample_rate=speech_output['sample_rate'])
136
- elif 'sampling_rate' in speech_output:
137
- st.audio(value, sample_rate=speech_output['sampling_rate'])
138
- else:
139
- st.audio(value, sample_rate=24000) # Default sample rate
140
- break
141
- else:
142
- st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
143
  except Exception as e:
144
  st.error(f"Error playing audio: {str(e)}")
145
  else:
146
- st.error("Audio generation failed. Please try again.")
 
 
 
 
 
 
 
1
  # import part
2
  import streamlit as st
3
  from transformers import pipeline
 
4
 
5
  # function part
6
# img2text
def img2text(image_path):
    """Generate a short English caption for the image at *image_path*.

    Args:
        image_path: Filesystem path to the image (the transformers
            image-to-text pipeline accepts a path or PIL image).

    Returns:
        The generated caption string.

    The BLIP captioning pipeline is created lazily and memoized on the
    function object so the model weights are loaded only once per
    process.  The original rebuilt the pipeline on every call, which is
    slow in Streamlit because the whole script reruns on each user
    interaction.
    """
    if not hasattr(img2text, "_pipe"):
        img2text._pipe = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    text = img2text._pipe(image_path)[0]["generated_text"]
    return text
11
 
12
+ # text2story - IMPROVED to end naturally
13
  def text2story(text):
14
+ # Using a smaller text generation model
15
  generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
16
 
17
  # Create a prompt for the story generation
18
+ prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "
19
 
20
+ # Generate the story
21
  story_result = generator(
22
  prompt,
23
+ max_length=250, # Increased to allow for a complete story
24
  num_return_sequences=1,
25
  temperature=0.7,
26
  top_k=50,
 
55
 
56
  return story_text
57
 
58
+ # text2audio - Simplified without numpy/scipy
59
  def text2audio(story_text):
60
  try:
61
  # Use the HelpingAI TTS model as requested
 
71
  story_text = story_text[:max_chars]
72
 
73
  # Generate speech
74
+ st.write("Generating audio...")
75
  speech = synthesizer(story_text)
76
+ st.write(f"Speech output keys: {list(speech.keys())}")
77
 
78
+ # We'll pass the audio data directly to Streamlit instead of saving to a file
79
+ # This works because Streamlit's st.audio() can take raw audio data
80
  return speech
81
 
82
  except Exception as e:
83
  st.error(f"Error generating audio: {str(e)}")
84
+ import traceback
85
+ st.error(traceback.format_exc())
86
  return None
87
 
88
# Function to save temporary image file
def save_uploaded_image(uploaded_file):
    """Persist a Streamlit UploadedFile to ./temp and return its path.

    The image-captioning pipeline is called with a filesystem path, so
    the uploaded bytes are written to ``temp/<original filename>``.

    Args:
        uploaded_file: Object exposing ``.name`` (str) and
            ``.getvalue()`` (bytes), e.g. a Streamlit UploadedFile.

    Returns:
        The path of the written file as a string.

    Bug fix: the module's import block never imported ``os``, so this
    function raised ``NameError`` at runtime; the import is done locally
    here to keep the fix self-contained.
    """
    import os  # local import: the file's top-level imports omit os

    # exist_ok avoids the race-prone exists()-then-makedirs pair.
    os.makedirs("temp", exist_ok=True)

    image_path = os.path.join("temp", uploaded_file.name)

    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    return image_path
99
+
100
  # main part
101
  st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
102
  st.header("Turn Your Image to Audio Story")
 
106
  # Display the uploaded image
107
  st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
108
 
109
+ # Save the image temporarily
110
+ image_path = save_uploaded_image(uploaded_file)
 
 
 
111
 
112
  # Stage 1: Image to Text
113
+ st.text('Processing img2text...')
114
+ caption = img2text(image_path)
115
+ st.write(caption)
 
116
 
117
  # Stage 2: Text to Story
118
+ st.text('Generating a story...')
119
+ story = text2story(caption)
120
+ st.write(story)
 
121
 
122
  # Stage 3: Story to Audio data
123
+ st.text('Generating audio data...')
124
+ speech_output = text2audio(story)
 
125
 
126
  # Play button
127
  if st.button("Play Audio"):
 
135
  elif 'waveform' in speech_output and 'sample_rate' in speech_output:
136
  st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
137
  else:
138
+ st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  except Exception as e:
140
  st.error(f"Error playing audio: {str(e)}")
141
  else:
142
+ st.error("Audio generation failed. Please try again.")
143
+
144
+ # Clean up the temporary files
145
+ try:
146
+ os.remove(image_path)
147
+ except:
148
+ pass