cnph001 committed
Commit 284179e · verified · Parent: 552e1db

Restore working

Files changed (1): app.py (+32 -74)
app.py CHANGED
@@ -5,33 +5,12 @@ import asyncio
 import tempfile
 import os
 import re # Import the regular expression module
-import struct
-import wave
 
-# Function to create a temporary silent WAV file
-def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
-    """Creates a temporary WAV file containing silence.
-
-    Args:
-        duration (float): Duration of silence in seconds.
-        temp_dir (str): Directory to save the temporary file.
-        sample_rate (int): Sample rate of the audio (samples per second).
-        num_channels (int): Number of audio channels (1 for mono, 2 for stereo).
-        sample_width (int): Sample width in bytes (e.g., 2 for 16-bit).
-
-    Returns:
-        str: Path to the temporary silent WAV file.
-    """
-    num_frames = int(duration * sample_rate)
-    silent_data = b'\x00' * (num_frames * num_channels * sample_width)
-
-    temp_wav_path = os.path.join(temp_dir, f"silent_{duration}.wav")
-    with wave.open(temp_wav_path, 'w') as wf:
-        wf.setnchannels(num_channels)
-        wf.setframerate(sample_rate)
-        wf.setsampwidth(sample_width)
-        wf.writeframes(silent_data)
-    return temp_wav_path
+
+# Get all available voices
+async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 
 # Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
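
For reference, a minimal sketch of what the restored get_voices() returns, assuming only that the edge-tts package is installed (the example entry is illustrative):

import asyncio
import edge_tts

async def get_voices():
    # Each voice dict from edge_tts.list_voices() carries ShortName, Locale, Gender, etc.
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

print(asyncio.run(get_voices()))
# e.g. {'en-US-AndrewMultilingualNeural - en-US (Male)': 'en-US-AndrewMultilingualNeural', ...}

Keying the dict by display label is what lets the synthesis code below recover the short name with (voice or default_voice).split(" - ")[0].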
@@ -48,16 +27,15 @@ async def paragraph_to_speech(text, voice, rate, pitch):
         return None, [] # Return None for audio path and empty list for silence
 
     audio_segments = []
-    temp_dir = tempfile.gettempdir()
+    silence_durations = []
     parts = re.split(r'(SS\d+\.?\d*)', text)
 
     for part in parts:
         if re.match(r'SS\d+\.?\d*', part):
            try:
                 silence_duration = float(part[2:])
-                # Assuming default WAV parameters for silence
-                silent_wav_path = create_silent_wav(silence_duration, temp_dir)
-                audio_segments.append(silent_wav_path)
+                silence_durations.append(silence_duration)
+                audio_segments.append(None) # Placeholder for silence
             except ValueError:
                 print(f"Warning: Invalid silence duration format: {part}")
         elif part.strip():
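
To see why the split above keeps the markers, note that re.split returns capture-group matches alongside the surrounding text (plain Python, runnable as-is):

import re

parts = re.split(r'(SS\d+\.?\d*)', "Hello there.SS1.5Now resume.")
print(parts)                # ['Hello there.', 'SS1.5', 'Now resume.']
print(float(parts[1][2:]))  # 1.5 -> seconds of silence requested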
@@ -93,19 +71,21 @@ async def paragraph_to_speech(text, voice, rate, pitch):
                 current_pitch = -30
                 current_rate = -20
             else:
+                # Use selected voice, or fallback to default
+                #voice_short_name = (voice or default_voice).split(" - ")[0]
                 current_voice = (voice or default_voice).split(" - ")[0]
             processed_text=part[:]
             rate_str = f"{current_rate:+d}%"
             pitch_str = f"{current_pitch:+d}Hz"
             communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 tmp_path = tmp_file.name
             await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
             audio_segments.append(None) # Empty string
 
-    return audio_segments, [] # Returning empty list for silence times as we are directly creating silent WAV
+    return audio_segments, silence_durations
 
 # Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
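
A self-contained sketch of the per-segment synthesis step shown in this hunk, using only calls that appear in the diff (the voice name and output path are illustrative):

import asyncio
import edge_tts

async def synth_one(text, voice="en-US-AndrewMultilingualNeural", rate=0, pitch=0):
    # edge-tts expects signed strings such as "+5%" and "-30Hz"
    communicate = edge_tts.Communicate(text, voice, rate=f"{rate:+d}%", pitch=f"{pitch:+d}Hz")
    await communicate.save("segment.mp3")  # edge-tts streams MP3 by default, hence the .mp3 suffix

asyncio.run(synth_one("Hello from edge-tts."))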
@@ -118,9 +98,12 @@ async def text_to_speech(text, voice, rate, pitch):
     final_audio_segments = []
 
     for paragraph in paragraphs:
-        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
+        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
         if audio_paths:
-            final_audio_segments.extend(audio_paths)
+            for i, path in enumerate(audio_paths):
+                final_audio_segments.append(path)
+                if i < len(silence_times):
+                    final_audio_segments.append(silence_times[i])
 
     if not any(isinstance(item, str) for item in final_audio_segments):
         return None, None # No actual audio generated
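
Tracing the new interleaving with a hypothetical paragraph shows what ends up in final_audio_segments (the paths are made up):

# paragraph = "Hello.SS1.5Goodbye."
# paragraph_to_speech returns:
#   audio_paths   = ['/tmp/a.mp3', None, '/tmp/b.mp3']   # None is the silence placeholder
#   silence_times = [1.5]
# so the loop above produces:
#   final_audio_segments = ['/tmp/a.mp3', 1.5, None, '/tmp/b.mp3']
# Strings are copied through as audio, the float becomes silence, and None is
# skipped by both isinstance checks downstream.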
@@ -128,42 +111,20 @@ async def text_to_speech(text, voice, rate, pitch):
     if all(not isinstance(item, str) for item in final_audio_segments):
         return None, "Only silence markers found."
 
-    combined_audio_path = tempfile.mktemp(suffix=".wav")
-    with wave.open(combined_audio_path, 'w') as outfile:
-        first_audio = True
-        sample_rate = None
-        num_channels = None
-        sample_width = None
-
-        for segment_path in final_audio_segments:
-            if isinstance(segment_path, str):
+    combined_audio_path = tempfile.mktemp(suffix=".mp3")
+    with open(combined_audio_path, 'wb') as outfile:
+        for segment in final_audio_segments:
+            if isinstance(segment, str):
                 try:
-                    with wave.open(segment_path, 'rb') as infile:
-                        current_num_channels = infile.getnchannels()
-                        current_sample_rate = infile.getframerate()
-                        current_sample_width = infile.getsampwidth()
-                        frames = infile.readframes(infile.getnframes())
-
-                        if first_audio:
-                            num_channels = current_num_channels
-                            sample_rate = current_sample_rate
-                            sample_width = current_sample_width
-                            outfile.setnchannels(num_channels)
-                            outfile.setframerate(sample_rate)
-                            outfile.setsampwidth(sample_width)
-                            first_audio = False
-                        elif (current_num_channels != num_channels or
-                              current_sample_rate != sample_rate or
-                              current_sample_width != sample_width):
-                            print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
-                            continue
-
-                        outfile.writeframes(frames)
-                    os.remove(segment_path) # Clean up individual files
-                except wave.Error as e:
-                    print(f"Warning: Error reading WAV file {segment_path}: {e}")
+                    with open(segment, 'rb') as infile:
+                        outfile.write(infile.read())
+                    os.remove(segment) # Clean up individual files
                 except FileNotFoundError:
-                    print(f"Warning: Audio file not found: {segment_path}")
+                    print(f"Warning: Audio file not found: {segment}")
+            elif isinstance(segment, (int, float)):
+                # Basic silence insertion (approximate)
+                silence = b'\x00' * int(segment * 44100 * 2) # Assuming 16-bit mono at 44.1kHz
+                outfile.write(silence)
 
     return combined_audio_path, None
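
A caveat on the block above: the zero bytes appended for silence are not valid MP3 frames, so the gap length is only approximate (as the inline comment concedes), and byte-concatenating MP3 files relies on players resynchronizing between frames. If exact gaps mattered, one alternative, sketched here on the assumption that pydub and ffmpeg are available (neither is used by this app), would be to decode, splice, and re-encode:

from pydub import AudioSegment  # pip install pydub; needs ffmpeg on PATH

def combine(segments, out_path="combined.mp3"):
    """segments: a list mixing MP3 paths (str) and silence durations in seconds (float)."""
    combined = AudioSegment.empty()
    for seg in segments:
        if isinstance(seg, str):
            combined += AudioSegment.from_mp3(seg)
        elif isinstance(seg, (int, float)):
            combined += AudioSegment.silent(duration=int(seg * 1000))  # pydub works in ms
    combined.export(out_path, format="mp3")
    return out_path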
 
@@ -173,12 +134,9 @@ def tts_interface(text, voice, rate, pitch):
     audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
     return audio, warning
 
-async def get_voices():
-    voices_list = await edge_tts.list_voices()
-    voices_dict = {v["ShortName"]: f"{v['Name']} - {v['LocaleName']} ({v['Gender']})" for v in voices_list}
-    return voices_dict
-
 # Create Gradio application
+import gradio as gr
+
 async def create_demo():
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)" # 👈 Pick one of the available voices
@@ -201,7 +159,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
+        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
         description=description,
         article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,
 