File size: 5,744 Bytes
fd43dfa
039f896
fd43dfa
1bb4376
fd43dfa
 
fe32dd7
039f896
fd43dfa
039f896
fe32dd7
 
 
fd43dfa
90d12be
fe32dd7
90d12be
 
 
 
fe32dd7
 
 
 
 
 
 
1bb4376
90d12be
 
 
 
 
 
 
 
 
fe32dd7
 
 
 
 
1bb4376
59b69bc
 
 
 
90d12be
59b69bc
1bb4376
 
 
 
 
 
 
 
7e8954f
1bb4376
 
 
 
 
 
7e8954f
 
 
 
 
 
 
 
 
 
1bb4376
 
59b69bc
 
 
 
 
 
 
1bb4376
fe32dd7
 
90d12be
fe32dd7
59b69bc
 
 
 
1bb4376
 
 
 
 
 
de6323e
 
 
 
 
 
 
90d12be
 
7e8954f
90d12be
1bb4376
 
 
 
 
 
 
de6323e
 
 
 
 
1bb4376
de6323e
1bb4376
 
 
 
 
 
de6323e
 
fd43dfa
90d12be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import gradio as gr
import os
import asyncio
import json
from conver import ConversationConfig, URLToAudioConverter
from dotenv import load_dotenv
from pydub import AudioSegment

load_dotenv()

MUSICA_FONDO = "musica.mp3"
TAG1 = "tag.mp3"
TAG2 = "tag2.mp3"

def mezclar_musica_y_tags(audio_path: str, custom_music_path: str = None) -> str:
    """Mix background music under the podcast and insert audio tags.

    Overlays looped background music (attenuated -15 dB) beneath the podcast
    audio, appends an outro tag, and drops a transition tag inside silent
    gaps detected in the speech track.

    Args:
        audio_path: Path to the rendered podcast audio file.
        custom_music_path: Optional user-supplied music file; falls back to
            MUSICA_FONDO when None or nonexistent.

    Returns:
        Path of the exported mix (input path with ``_con_musica.mp3`` suffix).
    """
    podcast_audio = AudioSegment.from_file(audio_path)
    music_file = custom_music_path if custom_music_path and os.path.exists(custom_music_path) else MUSICA_FONDO
    musica_fondo = AudioSegment.from_file(music_file).apply_gain(-15)
    tag_outro = AudioSegment.from_file(TAG1).apply_gain(-5)
    tag_trans = AudioSegment.from_file(TAG2).apply_gain(-5)

    # Loop the music enough times to cover the whole podcast, then trim.
    duracion_podcast = len(podcast_audio)
    repeticiones = (duracion_podcast // len(musica_fondo)) + 1
    musica_fondo_loop = (musica_fondo * repeticiones)[:duracion_podcast]

    mezcla = musica_fondo_loop.overlay(podcast_audio)
    mezcla = mezcla + tag_outro

    # Scan the speech track in 100 ms steps for 500 ms windows quieter than
    # -40 dBFS, then MERGE overlapping/adjacent windows into real silence
    # ranges. (The original kept each 500 ms window separate, so a transition
    # tag longer than 500 ms could never qualify, and a qualifying tag was
    # overlaid once per overlapping window inside the same silence.)
    ventanas = []
    for i in range(0, len(podcast_audio) - 500, 100):
        if podcast_audio[i:i + 500].dBFS < -40:
            ventanas.append((i, i + 500))

    rangos_silencio = []
    for inicio, fin in ventanas:
        if rangos_silencio and inicio <= rangos_silencio[-1][1]:
            # Window overlaps/abuts the previous range: extend it.
            rangos_silencio[-1] = (rangos_silencio[-1][0], max(rangos_silencio[-1][1], fin))
        else:
            rangos_silencio.append((inicio, fin))

    # One transition tag near the start of each silence long enough to hold it.
    for inicio, fin in rangos_silencio:
        if (fin - inicio) >= len(tag_trans):
            mezcla = mezcla.overlay(tag_trans, position=inicio + 50)

    output_path = audio_path.replace(".mp3", "_con_musica.mp3")
    mezcla.export(output_path, format="mp3")
    return output_path

async def generate_dialogue(article_url, text_input, language, skip_llm, custom_prompt):
    """Produce a podcast dialogue from a URL or pasted text.

    Returns a tuple ``(json_text, dialogue_dict)``; on failure the first
    element is an error message and the second is None. ``language`` is not
    used here — voices are selected later, at audio-generation time.
    """
    if not (article_url or text_input):
        return "Error: Ingresa una URL o texto", None

    try:
        cfg = ConversationConfig(custom_prompt_template=custom_prompt)
        conv = URLToAudioConverter(cfg, llm_api_key=os.environ.get("TOGETHER_API_KEY"))

        if skip_llm and text_input:
            # Free mode: wrap the raw text as a single-speaker conversation,
            # bypassing the LLM entirely.
            dialogue = {"conversation": [{"speaker": "Anfitrión1", "text": text_input}]}
        else:
            source_text = text_input if text_input else await conv.fetch_text(article_url)
            dialogue = conv.extract_conversation(source_text)

        # Render with double quotes and unescaped accents for readability.
        return json.dumps(dialogue, indent=2, ensure_ascii=False), dialogue
    except Exception as e:
        return f"Error: {str(e)}", None

async def generate_audio(dialogue_json, language, agregar_musica, custom_music):
    """Render the (possibly hand-edited) dialogue JSON to an audio file.

    Returns ``(conversation_text, audio_path)`` on success, or an error
    message and None on failure.
    """
    try:
        # Validate the user-editable JSON before doing any expensive work.
        try:
            dialogue = json.loads(dialogue_json.strip())
        except json.JSONDecodeError as e:
            return f"Error: JSON inválido - {str(e)}", None

        if not dialogue.get("conversation"):
            return "Error: El JSON no contiene 'conversation'", None

        converter = URLToAudioConverter(
            ConversationConfig(),
            llm_api_key=os.environ.get("TOGETHER_API_KEY"),
        )

        # Voice pair per language; anything unknown falls back to English.
        voice_map = {
            "en": ("en-US-AvaMultilingualNeural", "en-US-AndrewMultilingualNeural"),
            "es": ("es-ES-AlvaroNeural", "es-ES-ElviraNeural"),
        }
        voice1, voice2 = voice_map.get(language, voice_map["en"])

        output_file, conversation = await converter._process_to_audio(dialogue, voice1, voice2, custom_music)

        if agregar_musica:
            output_file = mezclar_musica_y_tags(output_file, custom_music)

        return conversation, output_file
    except Exception as e:
        return f"Error: {str(e)}", None

def synthesize_sync(article_url, text_input, language, skip_llm, custom_prompt):
    """Blocking wrapper so Gradio can invoke the async dialogue generator."""
    coro = generate_dialogue(article_url, text_input, language, skip_llm, custom_prompt)
    return asyncio.run(coro)

def generate_audio_sync(dialogue_json, language, agregar_musica, custom_music):
    """Blocking wrapper so Gradio can invoke the async audio generator."""
    coro = generate_audio(dialogue_json, language, agregar_musica, custom_music)
    return asyncio.run(coro)

# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(theme='gstaff/sketch') as demo:
    gr.Markdown("# 🎙 Podcast Converter")
    with gr.Group():
        text_url = gr.Textbox(label="URL (opcional)", placeholder="https://...")
        text_input = gr.Textbox(label="Texto manual", lines=5, placeholder="Pega tu texto aquí...")
        language = gr.Dropdown(["en", "es"], label="Idioma", value="en")
        skip_llm = gr.Checkbox(label="🔴 Modo libre (sin filtros LLM)", value=False)
        custom_prompt = gr.Textbox(
            label="Prompt personalizado (opcional)",
            placeholder='{text}\nCrea un diálogo de podcast en español entre Anfitrión1 y Anfitrión2. Usa un tono informal y genera al menos 6 intercambios por hablante. Devuelve SOLO un objeto JSON: {"conversation": [{"speaker": "Anfitrión1", "text": "..."}, {"speaker": "Anfitrión2", "text": "..."}]}'
        )
        btn_dialogue = gr.Button("Generar Diálogo", variant="primary")

    with gr.Group():
        dialogue_json = gr.Textbox(label="Diálogo JSON (editable)", lines=10, interactive=True)
        agregar_musica = gr.Checkbox(label="🎵 Agregar música de fondo y cortinillas", value=False)
        custom_music = gr.File(label="Subir música de fondo (opcional)", file_types=[".mp3"])
        btn_audio = gr.Button("Generar Audio", variant="primary")

    with gr.Row():
        conv_display = gr.Textbox(label="Conversación", interactive=False, lines=10)
        aud = gr.Audio(label="Audio Generado", interactive=False)

    # Hidden holder for the parsed dialogue dict that generate_dialogue
    # returns alongside the JSON text. The original wired BOTH return values
    # to dialogue_json (outputs=[dialogue_json, dialogue_json]), so the raw
    # dict clobbered the pretty-printed JSON in the textbox — and recent
    # Gradio versions reject duplicate output components outright.
    dialogue_state = gr.State()

    btn_dialogue.click(
        synthesize_sync,
        inputs=[text_url, text_input, language, skip_llm, custom_prompt],
        outputs=[dialogue_json, dialogue_state]
    )
    btn_audio.click(
        generate_audio_sync,
        inputs=[dialogue_json, language, agregar_musica, custom_music],
        outputs=[conv_display, aud]
    )

demo.launch()