Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Files Community

CR7CAD committed on Mar 8

Commit

5f21a2d

verified ·

1 Parent(s): 83842b8

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -11

app.py CHANGED Viewed

@@ -12,10 +12,47 @@ def img2text(image_path):
     return text
 # text2story
 def text2audio(story_text):
     try:
-        # Use the HelpingAI TTS model as requested
-        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
         # Limit text length to avoid timeouts
         max_chars = 500
@@ -26,19 +63,57 @@ def text2audio(story_text):
             else:
                 story_text = story_text[:max_chars]
-        # Generate speech
-        st.write("Generating audio...")
-        speech = synthesizer(story_text)
-        st.write(f"Speech output keys: {list(speech.keys())}")
-        # We'll pass the audio data directly to Streamlit instead of saving to a file
-        # This works because Streamlit's st.audio() can take raw audio data
-        return speech
     except Exception as e:
         st.error(f"Error generating audio: {str(e)}")
-        import traceback
-        st.error(traceback.format_exc())
         return None
 # Function to save temporary image file

     return text
 # text2story
+def text2story(text):
+    # Using a smaller text generation model
+    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    # Create a prompt for the story generation
+    prompt = f"Write a fun children's story based on this: {text}. Once upon a time, "
+    # Generate the story
+    story_result = generator(
+        prompt,
+        max_length=150,
+        num_return_sequences=1,
+        temperature=0.7,
+        top_k=50,
+        top_p=0.95,
+        do_sample=True
+    )
+    # Extract the generated text
+    story_text = story_result[0]['generated_text']
+    story_text = story_text.replace(prompt, "Once upon a time, ")
+    # Make sure the story is at least 100 words
+    words = story_text.split()
+    if len(words) > 100:
+        # Simply truncate to 100 words
+        story_text = " ".join(words[:100])
+    return story_text
+# text2audio - REVISED to correctly handle the audio output
 def text2audio(story_text):
     try:
+        # Use a different TTS model that works reliably with pipeline
+        synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+        # Additional input required for this model
+        speaker_embeddings = pipeline(
+            "audio-classification",
+            model="microsoft/speecht5_speaker_embeddings"
+        )("some_audio_file.mp3")["logits"]
         # Limit text length to avoid timeouts
         max_chars = 500
             else:
                 story_text = story_text[:max_chars]
+        # Generate speech with correct parameters
+        speech = synthesizer(
+            text=story_text,
+            forward_params={"speaker_embeddings": speaker_embeddings}
+        )
+        # Create a temporary WAV file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+        temp_filename = temp_file.name
+        temp_file.close()
+        # Display the structure of the speech output for debugging
+        st.write(f"Speech output keys: {speech.keys()}")
+        # Save the audio data to the temporary file
+        # Different models have different output formats, we'll try common keys
+        if 'audio' in speech:
+            # Convert numpy array to WAV file
+            try:
+                import scipy.io.wavfile as wavfile
+                wavfile.write(temp_filename, speech['sampling_rate'], speech['audio'])
+            except ImportError:
+                # If scipy is not available, try raw writing
+                with open(temp_filename, 'wb') as f:
+                    # Convert numpy array to bytes in a simple way
+                    if isinstance(speech['audio'], np.ndarray):
+                        audio_bytes = speech['audio'].tobytes()
+                        f.write(audio_bytes)
+                    else:
+                        f.write(speech['audio'])
+        elif 'numpy_array' in speech:
+            with open(temp_filename, 'wb') as f:
+                f.write(speech['numpy_array'].tobytes())
+        else:
+            # Fallback: try to write whatever is available
+            with open(temp_filename, 'wb') as f:
+                # Just write the first value that seems like it could be audio data
+                for key, value in speech.items():
+                    if isinstance(value, (bytes, bytearray)) or (
+                            isinstance(value, np.ndarray) and value.size > 1000):
+                        if isinstance(value, np.ndarray):
+                            f.write(value.tobytes())
+                        else:
+                            f.write(value)
+                        break
+        return temp_filename
     except Exception as e:
         st.error(f"Error generating audio: {str(e)}")
+        # Print all available keys for debugging
         return None
 # Function to save temporary image file