Spaces:
Running
Running
Update conver.py
Browse files
conver.py
CHANGED
@@ -17,6 +17,7 @@ class ConversationConfig:
|
|
17 |
max_words: int = 3000
|
18 |
prefix_url: str = "https://r.jina.ai/"
|
19 |
model_name: str = "meta-llama/Llama-3-8b-chat-hf"
|
|
|
20 |
|
21 |
class URLToAudioConverter:
|
22 |
def __init__(self, config: ConversationConfig, llm_api_key: str):
|
@@ -56,21 +57,20 @@ class URLToAudioConverter:
|
|
56 |
if not text:
|
57 |
raise ValueError("Input text cannot be empty")
|
58 |
try:
|
59 |
-
prompt = (
|
60 |
-
f"{text}\nConvierte el texto
|
61 |
-
f"
|
62 |
-
f"
|
63 |
-
f"
|
64 |
-
'{"conversation": [{"speaker": "Anfitrión1", "text": "..."}, {"speaker": "Anfitrión2", "text": "..."}]}'
|
65 |
)
|
66 |
-
print(f"Texto de entrada: {text[:200]}...")
|
67 |
response = self.llm_client.chat.completions.create(
|
68 |
messages=[{"role": "user", "content": prompt}],
|
69 |
model=self.config.model_name,
|
70 |
response_format={"type": "json_object"}
|
71 |
)
|
72 |
response_content = response.choices[0].message.content
|
73 |
-
print(f"Respuesta cruda del modelo: {response_content[:500]}...")
|
74 |
json_str = response_content.strip()
|
75 |
if not json_str.startswith('{'):
|
76 |
json_str = json_str[json_str.find('{'):]
|
@@ -132,66 +132,91 @@ class URLToAudioConverter:
|
|
132 |
self,
|
133 |
speech_audio: AudioSegment,
|
134 |
music_path: str,
|
135 |
-
tags_paths: List[str]
|
|
|
136 |
) -> AudioSegment:
|
137 |
-
|
|
|
138 |
if len(music) < len(speech_audio):
|
139 |
music = music * ((len(speech_audio) // len(music)) + 1)
|
140 |
music = music[:len(speech_audio)]
|
141 |
mixed = speech_audio.overlay(music)
|
142 |
-
|
143 |
-
tag_intro = AudioSegment.from_file(tags_paths[0]) - 10
|
144 |
tag_trans = AudioSegment.from_file(tags_paths[1]) - 10
|
145 |
-
final_audio =
|
146 |
-
|
147 |
silent_ranges = []
|
148 |
for i in range(0, len(speech_audio) - 500, 100):
|
149 |
chunk = speech_audio[i:i+500]
|
150 |
if chunk.dBFS < -40:
|
151 |
silent_ranges.append((i, i + 500))
|
152 |
-
|
153 |
for start, end in reversed(silent_ranges):
|
154 |
if (end - start) >= len(tag_trans):
|
155 |
final_audio = final_audio.overlay(tag_trans, position=start + 50)
|
156 |
-
|
157 |
return final_audio
|
158 |
|
159 |
-
async def url_to_audio(self, url: str, voice_1: str, voice_2: str) -> Tuple[str, str]:
|
160 |
text = self.fetch_text(url)
|
161 |
if len(words := text.split()) > self.config.max_words:
|
162 |
text = " ".join(words[:self.config.max_words])
|
163 |
conversation = self.extract_conversation(text)
|
164 |
-
return await self._process_to_audio(conversation, voice_1, voice_2)
|
165 |
|
166 |
-
async def text_to_audio(self, text: str, voice_1: str, voice_2: str) -> Tuple[str, str]:
|
167 |
conversation = self.extract_conversation(text)
|
168 |
-
return await self._process_to_audio(conversation, voice_1, voice_2)
|
169 |
|
170 |
-
async def raw_text_to_audio(self, text: str, voice_1: str, voice_2: str) -> Tuple[str, str]:
|
171 |
conversation = {"conversation": [{"speaker": "Anfitrión1", "text": text}]}
|
172 |
-
return await self._process_to_audio(conversation, voice_1, voice_2)
|
173 |
|
174 |
async def _process_to_audio(
|
175 |
self,
|
176 |
conversation: Dict,
|
177 |
voice_1: str,
|
178 |
-
voice_2: str
|
|
|
179 |
) -> Tuple[str, str]:
|
180 |
audio_files, folder_name = await self.text_to_speech(conversation, voice_1, voice_2)
|
181 |
combined = self.combine_audio_files(audio_files)
|
182 |
final_audio = self.add_background_music_and_tags(
|
183 |
combined,
|
184 |
"musica.mp3",
|
185 |
-
["tag.mp3", "tag2.mp3"]
|
|
|
186 |
)
|
187 |
output_path = os.path.join(folder_name, "podcast_final.mp3")
|
188 |
final_audio.export(output_path, format="mp3")
|
189 |
-
|
190 |
for f in audio_files:
|
191 |
os.remove(f)
|
192 |
-
|
193 |
text_output = "\n".join(
|
194 |
f"{turn['speaker']}: {turn['text']}"
|
195 |
for turn in conversation["conversation"]
|
196 |
)
|
197 |
return output_path, text_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
max_words: int = 3000
|
18 |
prefix_url: str = "https://r.jina.ai/"
|
19 |
model_name: str = "meta-llama/Llama-3-8b-chat-hf"
|
20 |
+
custom_prompt_template: str = None
|
21 |
|
22 |
class URLToAudioConverter:
|
23 |
def __init__(self, config: ConversationConfig, llm_api_key: str):
|
|
|
57 |
if not text:
|
58 |
raise ValueError("Input text cannot be empty")
|
59 |
try:
|
60 |
+
prompt = self.config.custom_prompt_template.format(text=text) if self.config.custom_prompt_template else (
|
61 |
+
f"{text}\nConvierte el texto en un diálogo de podcast en español entre Anfitrión1 y Anfitrión2. "
|
62 |
+
f"Genera una conversación extensa y natural con al menos 5 intercambios por hablante. "
|
63 |
+
f"Devuelve SOLO un objeto JSON: "
|
64 |
+
f'{{"conversation": [{{"speaker": "Anfitrión1", "text": "..."}}, {{"speaker": "Anfitrión2", "text": "..."}}]}}'
|
|
|
65 |
)
|
66 |
+
print(f"Texto de entrada: {text[:200]}...")
|
67 |
response = self.llm_client.chat.completions.create(
|
68 |
messages=[{"role": "user", "content": prompt}],
|
69 |
model=self.config.model_name,
|
70 |
response_format={"type": "json_object"}
|
71 |
)
|
72 |
response_content = response.choices[0].message.content
|
73 |
+
print(f"Respuesta cruda del modelo: {response_content[:500]}...")
|
74 |
json_str = response_content.strip()
|
75 |
if not json_str.startswith('{'):
|
76 |
json_str = json_str[json_str.find('{'):]
|
|
|
132 |
self,
|
133 |
speech_audio: AudioSegment,
|
134 |
music_path: str,
|
135 |
+
tags_paths: List[str],
|
136 |
+
custom_music_path: str = None
|
137 |
) -> AudioSegment:
|
138 |
+
music_file = custom_music_path if custom_music_path and os.path.exists(custom_music_path) else music_path
|
139 |
+
music = AudioSegment.from_file(music_file).fade_out(2000) - 25
|
140 |
if len(music) < len(speech_audio):
|
141 |
music = music * ((len(speech_audio) // len(music)) + 1)
|
142 |
music = music[:len(speech_audio)]
|
143 |
mixed = speech_audio.overlay(music)
|
144 |
+
tag_outro = AudioSegment.from_file(tags_paths[0]) - 10
|
|
|
145 |
tag_trans = AudioSegment.from_file(tags_paths[1]) - 10
|
146 |
+
final_audio = mixed + tag_outro
|
|
|
147 |
silent_ranges = []
|
148 |
for i in range(0, len(speech_audio) - 500, 100):
|
149 |
chunk = speech_audio[i:i+500]
|
150 |
if chunk.dBFS < -40:
|
151 |
silent_ranges.append((i, i + 500))
|
|
|
152 |
for start, end in reversed(silent_ranges):
|
153 |
if (end - start) >= len(tag_trans):
|
154 |
final_audio = final_audio.overlay(tag_trans, position=start + 50)
|
|
|
155 |
return final_audio
|
156 |
|
157 |
+
async def url_to_audio(self, url: str, voice_1: str, voice_2: str, custom_music_path: str = None) -> Tuple[str, str]:
    """Fetch the text behind *url* and turn it into a podcast audio file.

    The fetched text is capped at ``self.config.max_words``
    whitespace-separated words before it is handed to
    ``extract_conversation``; the resulting conversation is then rendered
    by ``_process_to_audio``.

    Args:
        url: Address passed to ``self.fetch_text``.
        voice_1: Voice identifier forwarded to ``_process_to_audio``.
        voice_2: Voice identifier forwarded to ``_process_to_audio``.
        custom_music_path: Optional background-music path, forwarded
            unchanged to ``_process_to_audio``.

    Returns:
        Whatever ``_process_to_audio`` returns: the output audio path and
        the dialogue transcript.
    """
    text = self.fetch_text(url)
    words = text.split()
    if len(words) > self.config.max_words:
        # Truncation re-joins on single spaces, so the original whitespace
        # is only normalised when the text is actually too long.
        text = " ".join(words[:self.config.max_words])
    conversation = self.extract_conversation(text)
    return await self._process_to_audio(conversation, voice_1, voice_2, custom_music_path)
|
163 |
|
164 |
+
async def text_to_audio(self, text: str, voice_1: str, voice_2: str, custom_music_path: str = None) -> Tuple[str, str]:
    """Turn *text* into a podcast audio file via a generated conversation.

    Unlike ``url_to_audio`` no fetching or word-limit truncation happens
    here: *text* goes straight to ``extract_conversation``.

    Args:
        text: Source text handed to ``self.extract_conversation``.
        voice_1: Voice identifier forwarded to ``_process_to_audio``.
        voice_2: Voice identifier forwarded to ``_process_to_audio``.
        custom_music_path: Optional background-music path, forwarded
            unchanged to ``_process_to_audio``.

    Returns:
        Whatever ``_process_to_audio`` returns: the output audio path and
        the dialogue transcript.
    """
    conversation = self.extract_conversation(text)
    return await self._process_to_audio(conversation, voice_1, voice_2, custom_music_path)
|
167 |
|
168 |
+
async def raw_text_to_audio(self, text: str, voice_1: str, voice_2: str, custom_music_path: str = None) -> Tuple[str, str]:
    """Render *text* verbatim as audio, skipping conversation extraction.

    The text is wrapped as a single turn spoken by "Anfitrión1" and passed
    to ``_process_to_audio``; ``extract_conversation`` is not called.

    Args:
        text: Text to be spoken as-is.
        voice_1: Voice identifier forwarded to ``_process_to_audio``.
        voice_2: Voice identifier forwarded to ``_process_to_audio``.
        custom_music_path: Optional background-music path, forwarded
            unchanged to ``_process_to_audio``.

    Returns:
        Whatever ``_process_to_audio`` returns: the output audio path and
        the dialogue transcript.
    """
    conversation = {"conversation": [{"speaker": "Anfitrión1", "text": text}]}
    return await self._process_to_audio(conversation, voice_1, voice_2, custom_music_path)
|
171 |
|
172 |
async def _process_to_audio(
    self,
    conversation: Dict,
    voice_1: str,
    voice_2: str,
    custom_music_path: str = None
) -> Tuple[str, str]:
    """Synthesize a conversation, mix it with music/tags and export it.

    Steps: synthesize the turns with ``text_to_speech``, concatenate them
    with ``combine_audio_files``, mix in background music and tags with
    ``add_background_music_and_tags``, export the result as
    ``podcast_final.mp3`` inside the folder reported by ``text_to_speech``,
    and finally delete the per-turn audio files.

    Args:
        conversation: Mapping with a ``"conversation"`` list whose items
            carry ``"speaker"`` and ``"text"`` keys.
        voice_1: Voice identifier forwarded to ``text_to_speech``.
        voice_2: Voice identifier forwarded to ``text_to_speech``.
        custom_music_path: Optional background-music path forwarded to
            ``add_background_music_and_tags``.

    Returns:
        ``(output_path, text_output)``: the exported MP3 path and a
        transcript with one ``"speaker: text"`` line per turn.
    """
    audio_files, folder_name = await self.text_to_speech(conversation, voice_1, voice_2)
    combined = self.combine_audio_files(audio_files)
    # Default music and tag assets are hard-coded relative file names.
    final_audio = self.add_background_music_and_tags(
        combined,
        "musica.mp3",
        ["tag.mp3", "tag2.mp3"],
        custom_music_path
    )
    output_path = os.path.join(folder_name, "podcast_final.mp3")
    final_audio.export(output_path, format="mp3")

    # The per-turn files are only intermediates; remove them once the
    # final mix has been written.
    for f in audio_files:
        os.remove(f)

    text_output = "\n".join(
        f"{turn['speaker']}: {turn['text']}"
        for turn in conversation["conversation"]
    )
    return output_path, text_output
|
196 |
+
```
|
197 |
+
|
198 |
+
### Ejemplo de `app.py`
|
199 |
+
```python
|
200 |
+
from conver import ConversationConfig, URLToAudioConverter
|
201 |
+
import asyncio
|
202 |
+
|
203 |
+
# Prompt template for the example. The converter renders a custom template
# with ``str.format(text=...)``, so every literal JSON brace must be doubled;
# single braces would be parsed as replacement fields and raise at format time.
CUSTOM_PROMPT_TEMPLATE = (
    "{text}\nCrea un diálogo de podcast en español entre Anfitrión1 y Anfitrión2. "
    "Usa un tono informal y genera al menos 6 intercambios por hablante. "
    'Devuelve SOLO un objeto JSON: {{"conversation": [{{"speaker": "Anfitrión1", "text": "..."}}, '
    '{{"speaker": "Anfitrión2", "text": "..."}}]}}'
)


async def main():
    """Example: build a podcast from a short text with a custom prompt and music."""
    config = ConversationConfig(custom_prompt_template=CUSTOM_PROMPT_TEMPLATE, max_words=5000)
    converter = URLToAudioConverter(config, "tu_api_key")
    text = "Discusión sobre inteligencia artificial y su impacto."
    output_path, text_output = await converter.text_to_audio(
        text,
        voice_1="es-ES-AlvaroNeural",
        voice_2="es-ES-ElviraNeural",
        custom_music_path="mi_musica.mp3"
    )
    print(f"Podcast generado en: {output_path}")
    print(f"Texto del diálogo:\n{text_output}")


if __name__ == "__main__":
    asyncio.run(main())
|