File size: 7,939 Bytes
5fe16b1
b41ca3b
5fe16b1
 
587b534
5fe16b1
 
 
 
 
 
00b6284
5fe16b1
 
 
 
 
00b6284
5fe16b1
 
 
 
 
 
 
 
00b6284
5fe16b1
 
71a491a
 
 
 
 
 
 
5fe16b1
 
00b6284
71a491a
 
 
 
00b6284
 
 
71a491a
00b6284
71a491a
 
 
 
00b6284
 
71a491a
 
5fe16b1
 
00b6284
eb823dc
5fe16b1
71a491a
 
00b6284
 
 
71a491a
eb823dc
 
71a491a
00b6284
5fe16b1
00b6284
 
71a491a
00b6284
 
b41ca3b
00b6284
 
 
 
b41ca3b
885ea0a
00b6284
 
5fe16b1
 
00b6284
b41ca3b
587b534
 
5fe16b1
587b534
00b6284
71a491a
00b6284
 
b41ca3b
 
 
 
5fe16b1
00b6284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587b534
00b6284
 
 
 
587b534
00b6284
 
 
 
 
 
587b534
00b6284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf07215
00b6284
 
 
 
 
 
b41ca3b
00b6284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f9b00e
00b6284
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import base64
import json
import os
import shutil
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import edge_tts
import httpx
import numpy as np
from openai import OpenAI
from pydub import AudioSegment

@dataclass
class ConversationConfig:
    """Settings consumed by URLToAudioConverter when building a podcast."""

    # Word cap applied to text fetched from a URL before it is sent to the LLM.
    max_words: int = 3000
    # Prefix prepended to the target URL when its text is fetched.
    prefix_url: str = "https://r.jina.ai/"
    # Model identifier passed to the chat-completions request.
    model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct"

class URLToAudioConverter:
    def __init__(self, config: ConversationConfig, llm_api_key: str):
        """Store the configuration and build the LLM client.

        Args:
            config: Conversation settings (word cap, URL prefix, model name).
            llm_api_key: API key handed to the OpenAI-compatible client, which
                is pointed at ``https://api.together.xyz/v1``.
        """
        client = OpenAI(
            base_url="https://api.together.xyz/v1",
            api_key=llm_api_key,
        )
        self.config = config
        self.llm_client = client
        # Initialised to None; nothing in this class assigns it afterwards.
        self.llm_out = None

    def fetch_text(self, url: str) -> str:
        """Obtiene texto desde una URL"""
        if not url:
            raise ValueError("URL cannot be empty")
        full_url = f"{self.config.prefix_url}{url}"
        try:
            response = httpx.get(full_url, timeout=60.0)
            response.raise_for_status()
            return response.text
        except httpx.HTTPError as e:
            raise RuntimeError(f"Failed to fetch URL: {e}")

    def extract_conversation(self, text: str) -> Dict:
        """Convierte texto plano a estructura de conversación"""
        if not text:
            raise ValueError("Input text cannot be empty")
        try:
            prompt = (
                f"{text}\nConvert this text into a podcast conversation between two hosts. "
                "Return ONLY JSON with this structure:\n"
                '{"conversation": [{"speaker": "Host1", "text": "..."}, {"speaker": "Host2", "text": "..."}]}'
            )
            response = self.llm_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.config.model_name,
                response_format={"type": "json_object"}
            )
            json_str = response.choices[0].message.content.strip()
            return json.loads(json_str[json_str.find('{'):json_str.rfind('}')+1])
        except Exception as e:
            raise RuntimeError(f"Failed to extract conversation: {str(e)}")

    async def text_to_speech(self, conversation_json: Dict, voice_1: str, voice_2: str) -> Tuple[List[str], str]:
        """Synthesize one MP3 file per conversation turn.

        Args:
            conversation_json: Dict with a ``"conversation"`` list whose items
                have ``"speaker"`` and ``"text"`` keys.
            voice_1: Voice used for turns whose speaker is ``"Host1"``.
            voice_2: Voice used for every other speaker.

        Returns:
            ``(filenames, output_dir)``: the segment paths in turn order and
            the directory that contains them.

        Raises:
            RuntimeError: If synthesis or file handling fails for any turn.
        """
        output_dir = Path(self._create_output_directory())
        filenames: List[str] = []
        try:
            for i, turn in enumerate(conversation_json["conversation"]):
                destination = output_dir / f"segment_{i}.mp3"
                voice = voice_1 if turn["speaker"] == "Host1" else voice_2
                tmp_path = await self._generate_audio(turn["text"], voice)
                try:
                    # The temp directory and the output directory may be on
                    # different filesystems, where os.rename fails;
                    # shutil.move falls back to copy-and-delete.
                    shutil.move(tmp_path, str(destination))
                except Exception:
                    # Do not leave the orphaned temp file behind.
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)
                    raise
                filenames.append(str(destination))
        except Exception as e:
            raise RuntimeError(f"Text-to-speech failed: {e}") from e
        return filenames, str(output_dir)

    async def _generate_audio(self, text: str, voice: str) -> str:
        """Genera audio temporal con edge-tts"""
        if not text.strip():
            raise ValueError("Text cannot be empty")
        
        communicate = edge_tts.Communicate(
            text,
            voice.split(" - ")[0],
            rate="+0%",
            pitch="+0Hz"
        )
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            await communicate.save(tmp_file.name)
            return tmp_file.name

    def _create_output_directory(self) -> str:
        """Crea directorio único para los archivos"""
        folder_name = base64.urlsafe_b64encode(os.urandom(8)).decode("utf-8")
        os.makedirs(folder_name, exist_ok=True)
        return folder_name

    def combine_audio_files(self, filenames: List[str]) -> AudioSegment:
        """Combina segmentos de audio"""
        if not filenames:
            raise ValueError("No audio files provided")
        
        combined = AudioSegment.empty()
        for filename in filenames:
            combined += AudioSegment.from_file(filename, format="mp3")
        return combined

    def _detect_silences(self, audio: AudioSegment, min_len: int = 500, thresh: int = -40) -> List[Tuple[int, int]]:
        """Detecta intervalos de silencio en el audio"""
        silent_ranges = []
        start = None
        
        samples = np.array(audio.get_array_of_samples())
        window_size = int(min_len * audio.frame_rate / 1000)
        
        for i in range(0, len(samples) - window_size, window_size):
            window = samples[i:i+window_size]
            if np.max(window) < thresh:
                if start is None:
                    start = i
            else:
                if start is not None:
                    silent_ranges.append((start, i))
                    start = None
        
        return silent_ranges

    def add_background_music_and_tags(
        self,
        speech_audio: AudioSegment,
        music_path: str,
        tags_paths: List[str]
    ) -> AudioSegment:
        """Mix background music under the speech and add intro/transition tags.

        Args:
            speech_audio: The spoken conversation.
            music_path: Path of the background music file.
            tags_paths: At least two paths: the intro tag first, then the
                transition tag.

        Returns:
            The intro tag followed by the speech mixed with attenuated music,
            with the transition tag overlaid on silences longer than the tag.

        Raises:
            ValueError: If fewer than two tag paths are given or the music
                file contains no audio.
        """
        if len(tags_paths) < 2:
            raise ValueError("tags_paths must contain an intro tag and a transition tag")

        # 1. Load the music and lower its volume by 25 dB.
        music = AudioSegment.from_file(music_path) - 25
        if len(music) == 0:
            raise ValueError("Background music is empty")

        # 2. Loop only when the music is shorter than the speech, then trim.
        if len(music) < len(speech_audio):
            loops = (len(speech_audio) // len(music)) + 1
            music = music * loops
        music = music[:len(speech_audio)]
        # Fade after looping and trimming so the fade lands at the end of the
        # final mix instead of repeating in every loop.
        if len(music) > 0:
            music = music.fade_out(min(2000, len(music)))

        # 3. Mix speech and music.
        mixed = speech_audio.overlay(music, position=0)

        # 4. Load the tags, each lowered by 10 dB.
        tag_intro = AudioSegment.from_file(tags_paths[0]) - 10
        tag_transition = AudioSegment.from_file(tags_paths[1]) - 10

        # Intro tag first.
        final_audio = tag_intro + mixed

        # Transition tags on pauses. Silence ranges are used as millisecond
        # offsets into speech_audio, and final_audio starts with the intro tag,
        # so every position is shifted by the intro length.
        intro_offset = len(tag_intro)
        for start, end in self._detect_silences(speech_audio):
            if (end - start) > len(tag_transition):
                final_audio = final_audio.overlay(
                    tag_transition,
                    position=intro_offset + start + 100  # small margin into the pause
                )

        return final_audio

    async def process_content(
        self,
        content: str,
        voice_1: str,
        voice_2: str,
        is_url: bool = False
    ) -> Tuple[str, str]:
        """Procesa contenido (URL o texto) a audio final"""
        try:
            # 1. Obtener texto estructurado
            if is_url:
                text = self.fetch_text(content)
                if len(words := text.split()) > self.config.max_words:
                    text = " ".join(words[:self.config.max_words])
                conversation = self.extract_conversation(text)
            else:
                conversation = self.extract_conversation(content)
            
            # 2. Generar audio
            audio_files, folder_name = await self.text_to_speech(conversation, voice_1, voice_2)
            combined = self.combine_audio_files(audio_files)
            
            # 3. Mezclar con música y tags
            final_audio = self.add_background_music_and_tags(
                combined,
                "musica.mp3",
                ["tag.mp3", "tag2.mp3"]
            )
            
            # 4. Exportar
            output_path = os.path.join(folder_name, "podcast_final.mp3")
            final_audio.export(output_path, format="mp3")
            
            # 5. Limpieza
            for f in audio_files:
                os.remove(f)
            
            # Texto de conversación
            conversation_text = "\n".join(
                f"{turn['speaker']}: {turn['text']}" 
                for turn in conversation["conversation"]
            )
            
            return output_path, conversation_text
            
        except Exception as e:
            raise RuntimeError(f"Processing failed: {str(e)}")