CR7CAD committed on
Commit
7c4bc18
·
verified ·
1 Parent(s): ad4186a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -23
app.py CHANGED
@@ -9,14 +9,15 @@ def img2text(image):
9
  text = image_to_text(image)[0]["generated_text"]
10
  return text
11
 
12
- # Simple text-to-story function
13
  def text2story(text):
14
  generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
15
- prompt = f"Write a short children's story based on this: {text}. Once upon a time, "
16
 
 
17
  story_result = generator(
18
  prompt,
19
- max_length=150,
20
  num_return_sequences=1,
21
  temperature=0.7,
22
  do_sample=True
@@ -24,12 +25,60 @@ def text2story(text):
24
 
25
  story_text = story_result[0]['generated_text']
26
  story_text = story_text.replace(prompt, "Once upon a time, ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  return story_text
28
 
29
- # Simple text-to-audio function
30
  def text2audio(story_text):
31
- synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
32
- speech = synthesizer(story_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return speech
34
 
35
  # Basic Streamlit interface
@@ -44,26 +93,31 @@ if uploaded_file is not None:
44
  image = Image.open(uploaded_file)
45
 
46
  # Image to Text
47
- st.write("Generating caption...")
48
- caption = img2text(image)
49
  st.write(f"Caption: {caption}")
50
 
51
  # Text to Story
52
- st.write("Creating story...")
53
- story = text2story(caption)
54
  st.write(f"Story: {story}")
55
 
56
  # Text to Audio
57
- st.write("Generating audio...")
58
- speech_output = text2audio(story)
59
-
60
- # Play audio
61
- try:
62
- if 'audio' in speech_output and 'sampling_rate' in speech_output:
63
- st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
64
- elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
65
- st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
66
- else:
67
- st.write("Audio generated but could not be played.")
68
- except Exception as e:
69
- st.error(f"Error playing audio: {e}")
 
 
 
 
 
 
9
  text = image_to_text(image)[0]["generated_text"]
10
  return text
11
 
12
+ # Improved text-to-story function with natural ending
13
  def text2story(text):
14
  generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
15
+ prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
16
 
17
+ # Generate a longer text to ensure we get a complete story
18
  story_result = generator(
19
  prompt,
20
+ max_length=300,
21
  num_return_sequences=1,
22
  temperature=0.7,
23
  do_sample=True
 
25
 
26
  story_text = story_result[0]['generated_text']
27
  story_text = story_text.replace(prompt, "Once upon a time, ")
28
+
29
+ # Find natural ending points (end of sentences)
30
+ periods = [i for i, char in enumerate(story_text) if char == '.']
31
+ question_marks = [i for i, char in enumerate(story_text) if char == '?']
32
+ exclamation_marks = [i for i, char in enumerate(story_text) if char == '!']
33
+
34
+ # Combine all ending punctuation and sort
35
+ all_endings = sorted(periods + question_marks + exclamation_marks)
36
+
37
+ # If we have any sentence endings
38
+ if all_endings:
39
+ # Get the index where the story should reasonably end (after at least 100 characters)
40
+ min_story_length = 100
41
+ suitable_endings = [i for i in all_endings if i >= min_story_length]
42
+
43
+ if suitable_endings:
44
+ # Find an ending that completes a thought (not just the first sentence)
45
+ if len(suitable_endings) > 2:
46
+ # Use the third sentence ending or later for a more complete story
47
+ return story_text[:suitable_endings[2]+1]
48
+ else:
49
+ # If we don't have many sentences, use the last one we found
50
+ return story_text[:suitable_endings[-1]+1]
51
+
52
+ # If no good ending is found, return as is
53
  return story_text
54
 
55
+ # Updated text-to-audio function with a compatible model
56
  def text2audio(story_text):
57
+ # Use Microsoft's SpeechT5 model which is widely supported
58
+ synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
59
+
60
+ # This model requires speaker embeddings
61
+ from transformers import SpeechT5HifiGan
62
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
63
+
64
+ # Get speaker embeddings for a female voice
65
+ from transformers import SpeechT5Processor
66
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
67
+ speaker_embeddings = processor.speaker_embeddings["female"]
68
+
69
+ # Limit text length to avoid issues
70
+ max_chars = 500
71
+ if len(story_text) > max_chars:
72
+ last_period = story_text[:max_chars].rfind('.')
73
+ if last_period > 0:
74
+ story_text = story_text[:last_period + 1]
75
+ else:
76
+ story_text = story_text[:max_chars]
77
+
78
+ # Generate speech with appropriate parameters
79
+ inputs = processor(text=story_text, return_tensors="pt")
80
+ speech = synthesizer(inputs["input_ids"][0], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
81
+
82
  return speech
83
 
84
  # Basic Streamlit interface
 
93
  image = Image.open(uploaded_file)
94
 
95
  # Image to Text
96
+ with st.spinner("Generating caption..."):
97
+ caption = img2text(image)
98
  st.write(f"Caption: {caption}")
99
 
100
  # Text to Story
101
+ with st.spinner("Creating story..."):
102
+ story = text2story(caption)
103
  st.write(f"Story: {story}")
104
 
105
  # Text to Audio
106
+ with st.spinner("Generating audio..."):
107
+ try:
108
+ speech_output = text2audio(story)
109
+
110
+ # Play audio
111
+ if hasattr(speech_output, 'numpy') or hasattr(speech_output, 'audio'):
112
+ if hasattr(speech_output, 'numpy'):
113
+ audio_data = speech_output.numpy()
114
+ else:
115
+ audio_data = speech_output.audio
116
+
117
+ sample_rate = speech_output.sampling_rate if hasattr(speech_output, 'sampling_rate') else 16000
118
+ st.audio(audio_data, sample_rate=sample_rate)
119
+ else:
120
+ st.audio(speech_output['audio'], sample_rate=speech_output.get('sampling_rate', 16000))
121
+ except Exception as e:
122
+ st.error(f"Error generating or playing audio: {e}")
123
+ st.write("Try installing the latest transformers library with: pip install --upgrade transformers")