CR7CAD committed on
Commit
efe4c0f
·
verified ·
1 Parent(s): 3b82d8f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -51
app.py CHANGED
@@ -1,8 +1,7 @@
1
- # Imports
2
  import streamlit as st
3
  from transformers import pipeline
4
  from PIL import Image
5
- import torch
6
 
7
  # Simple image-to-text function
8
  def img2text(image):
@@ -10,7 +9,7 @@ def img2text(image):
10
  text = image_to_text(image)[0]["generated_text"]
11
  return text
12
 
13
- # Improved text-to-story function with natural ending
14
  def text2story(text):
15
  generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
16
  prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
@@ -53,33 +52,10 @@ def text2story(text):
53
  # If no good ending is found, return as is
54
  return story_text
55
 
56
- # Updated text-to-audio function with a compatible model
57
  def text2audio(story_text):
58
- # Use Microsoft's SpeechT5 model which is widely supported
59
- synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
60
-
61
- # This model requires speaker embeddings
62
- from transformers import SpeechT5HifiGan
63
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
64
-
65
- # Get speaker embeddings for a female voice
66
- from transformers import SpeechT5Processor
67
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
68
- speaker_embeddings = processor.speaker_embeddings["female"]
69
-
70
- # Limit text length to avoid issues
71
- max_chars = 500
72
- if len(story_text) > max_chars:
73
- last_period = story_text[:max_chars].rfind('.')
74
- if last_period > 0:
75
- story_text = story_text[:last_period + 1]
76
- else:
77
- story_text = story_text[:max_chars]
78
-
79
- # Generate speech with appropriate parameters
80
- inputs = processor(text=story_text, return_tensors="pt")
81
- speech = synthesizer(inputs["input_ids"][0], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
82
-
83
  return speech
84
 
85
  # Basic Streamlit interface
@@ -94,31 +70,26 @@ if uploaded_file is not None:
94
  image = Image.open(uploaded_file)
95
 
96
  # Image to Text
97
- with st.spinner("Generating caption..."):
98
- caption = img2text(image)
99
  st.write(f"Caption: {caption}")
100
 
101
  # Text to Story
102
- with st.spinner("Creating story..."):
103
- story = text2story(caption)
104
  st.write(f"Story: {story}")
105
 
106
  # Text to Audio
107
- with st.spinner("Generating audio..."):
108
- try:
109
- speech_output = text2audio(story)
110
-
111
- # Play audio
112
- if hasattr(speech_output, 'numpy') or hasattr(speech_output, 'audio'):
113
- if hasattr(speech_output, 'numpy'):
114
- audio_data = speech_output.numpy()
115
- else:
116
- audio_data = speech_output.audio
117
-
118
- sample_rate = speech_output.sampling_rate if hasattr(speech_output, 'sampling_rate') else 16000
119
- st.audio(audio_data, sample_rate=sample_rate)
120
- else:
121
- st.audio(speech_output['audio'], sample_rate=speech_output.get('sampling_rate', 16000))
122
- except Exception as e:
123
- st.error(f"Error generating or playing audio: {e}")
124
- st.write("Try installing the latest transformers library with: pip install --upgrade transformers")
 
1
+ # Imports: Streamlit UI, Hugging Face pipeline, and PIL image loading
2
  import streamlit as st
3
  from transformers import pipeline
4
  from PIL import Image
 
5
 
6
  # Simple image-to-text function
7
  def img2text(image):
 
9
  text = image_to_text(image)[0]["generated_text"]
10
  return text
11
 
12
+ # Simple text-to-story function
13
  def text2story(text):
14
  generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
15
  prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
 
52
  # If no good ending is found, return as is
53
  return story_text
54
 
55
+ # Simple text-to-audio function
56
  def text2audio(story_text):
57
+ synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
58
+ speech = synthesizer(story_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  return speech
60
 
61
  # Basic Streamlit interface
 
70
  image = Image.open(uploaded_file)
71
 
72
  # Image to Text
73
+ st.write("Generating caption...")
74
+ caption = img2text(image)
75
  st.write(f"Caption: {caption}")
76
 
77
  # Text to Story
78
+ st.write("Creating story...")
79
+ story = text2story(caption)
80
  st.write(f"Story: {story}")
81
 
82
  # Text to Audio
83
+ st.write("Generating audio...")
84
+ speech_output = text2audio(story)
85
+
86
+ # Play audio
87
+ try:
88
+ if 'audio' in speech_output and 'sampling_rate' in speech_output:
89
+ st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
90
+ elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
91
+ st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
92
+ else:
93
+ st.write("Audio generated but could not be played.")
94
+ except Exception as e:
95
+ st.error(f"Error playing audio: {e}")