udayl committed · Commit b65edab · 1 Parent(s): 32d1493
Files changed (2):
  1. gradio_app.py +118 -209
  2. notebook_lm_kokoro.py +106 -52
gradio_app.py CHANGED
@@ -1,263 +1,172 @@
- # filepath: /Users/udaylunawat/Downloads/Data-Science-Projects/NotebookLM_clone/gradio_app.py
  import os
  import tempfile
  import gradio as gr
- from notebook_lm_kokoro import generate_podcast_script, KPipeline
- import soundfile as sf
- import numpy as np
- import ast
  import shutil
  import warnings
- import os
- import gradio as gr
- import concurrent.futures
- import multiprocessing
- from notebook_lm_kokoro import generate_podcast_script, generate_audio_from_script
- warnings.filterwarnings("ignore")

- # Define number of workers based on CPU cores
- NUM_WORKERS = multiprocessing.cpu_count()  # Gets total CPU cores

- def process_segment(entry_and_voice_map):
-     entry, voice_map = entry_and_voice_map  # Unpack the tuple
      speaker, dialogue = entry
      chosen_voice = voice_map.get(speaker, "af_heart")
-     print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")
-
      pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
      generator = pipeline(dialogue, voice=chosen_voice)
-
-     segment_audio = []
-     for _, _, audio in generator:
-         segment_audio.append(audio)
-
-     if segment_audio:
-         return np.concatenate(segment_audio, axis=0)
-     return None

  def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
-     voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
-
-     # Clean up the script string if needed
-     script = script.strip()
-     if not script.startswith("[") or not script.endswith("]"):
-         print("Invalid transcript format. Expected a list of tuples.")
-         return None

      try:
          transcript_list = ast.literal_eval(script)
          if not isinstance(transcript_list, list):
              raise ValueError("Transcript is not a list")

-         all_audio_segments = []
-         # Prepare input data with voice_map for each entry
-         entries_with_voice_map = [(entry, voice_map) for entry in transcript_list]

-         try:
-             # Process segments in parallel
-             with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
-                 # Map the processing function across all dialogue entries
-                 results = list(executor.map(process_segment, entries_with_voice_map))
-
-                 # Filter out None results and combine audio segments
-                 all_audio_segments = [r for r in results if r is not None]
-
-         except Exception as e:
-             print(f"Error during audio generation: {e}")
              return None
-
-         if not all_audio_segments:
-             print("No audio segments were generated")
-             return None
-
-         # Add a pause between segments
          sample_rate = 24000
          pause = np.zeros(sample_rate, dtype=np.float32)
-         final_audio = all_audio_segments[0]
-         for seg in all_audio_segments[1:]:
              final_audio = np.concatenate((final_audio, pause, seg), axis=0)
-
          sf.write(output_file, final_audio, sample_rate)
-         print(f"Saved final audio as {output_file}")
          return output_file
-
      except Exception as e:
-         print(f"Error processing transcript: {e}")
          return None

-
- def process_pdf(pdf_file, speaker1_voice, speaker2_voice, provider, api_key, openrouter_base=None):
-     """Process the uploaded PDF file and generate audio"""
      try:
-         # Set API configuration based on provider
-         os.environ["OPENAI_API_KEY"] = api_key
-         if provider == "openai":
              os.environ["OPENROUTER_API_BASE"] = "https://api.openai.com/v1"
-         else:
              os.environ["OPENROUTER_API_BASE"] = openrouter_base or "https://openrouter.ai/api/v1"

-         # Check if file is uploaded
          if pdf_file is None:
              return "No file uploaded", None

-         # Use /tmp if writable, else fallback to current directory
-         base_dir = "/tmp" if os.access("/tmp", os.W_OK) else os.getcwd()

-         # Save uploaded PDF to temp location
-         tmp_path = os.path.join(base_dir, f"uploaded_{os.path.basename(pdf_file.name)}")
-         shutil.copy2(pdf_file.name, tmp_path)
-         print(f"[INFO] Uploaded PDF saved at {tmp_path}")

-         # Generate podcast script
-         transcript, transcript_path = generate_podcast_script(tmp_path, provider=provider)
          if transcript is None:
-             return "Error generating transcript", None
-
-         # Define output file path
-         audio_output_path = os.path.join(
-             os.path.dirname(tmp_path),
-             f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}"
-         )
-
-         # Generate audio using ProcessPoolExecutor
-         with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
-             print(f"[INFO] Processing audio with {NUM_WORKERS} CPU cores")
-             future = executor.submit(
-                 generate_audio_from_script_with_voices,
-                 transcript, speaker1_voice, speaker2_voice, audio_output_path
-             )
-             result = future.result()

-         if result is None:
-             return "Error generating audio", None

-         return "Process complete!", result

      except Exception as e:
-         print(f"[ERROR] process_pdf failed: {str(e)}")
-         return f"Error processing file: {str(e)}", None
-

  def create_gradio_app():
-     # Add CSS for better styling
-     css = """
-     .gradio-container {max-width: 900px !important}
-     """
-
      with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
-         gr.Markdown(
-             """
-             # 📚 NotebookLM-Kokoro TTS App
-             Upload a PDF, choose voices, and generate conversational audio using Kokoro TTS.
-             """
-         )
-
          with gr.Row():
-             with gr.Column(scale=2):
-                 pdf_input = gr.File(
-                     label="Upload PDF Document",
-                     file_types=[".pdf"],
-                     type="filepath"
-                 )
-
-                 with gr.Row():
-                     speaker1_voice = gr.Dropdown(
-                         choices=["af_heart", "af_bella", "hf_beta"],
-                         value="af_heart",
-                         label="Speaker 1 Voice"
-                     )
-                     speaker2_voice = gr.Dropdown(
-                         choices=["af_nicole", "af_heart", "bf_emma"],
-                         value="bf_emma",
-                         label="Speaker 2 Voice"
-                     )
-
-                 with gr.Group():
-                     provider = gr.Radio(
-                         choices=["openai", "openrouter"],
-                         value="openrouter",
-                         label="API Provider"
-                     )
-
-                     api_key = gr.Textbox(
-                         label="API Key",
-                         placeholder="Enter your API key here...",
-                         type="password",
-                         elem_classes="api-input"
-                     )
-
-                     openrouter_base = gr.Textbox(
-                         label="OpenRouter Base URL (optional)",
-                         placeholder="https://openrouter.ai/api/v1",
-                         visible=False,
-                         elem_classes="api-input"
-                     )

-                 # Show/hide OpenRouter base URL based on provider selection
-                 def toggle_openrouter_base(provider_choice):
-                     return gr.update(visible=provider_choice == "openrouter")
-
-                 provider.change(
-                     fn=toggle_openrouter_base,
-                     inputs=[provider],
-                     outputs=[openrouter_base]
-                 )
-
-                 submit_btn = gr.Button("🎙️ Generate Audio", variant="primary")
-
-             with gr.Column(scale=2):
-                 status_output = gr.Textbox(
-                     label="Status",
-                     placeholder="Processing status will appear here..."
-                 )
-                 audio_output = gr.Audio(
-                     label="Generated Audio",
-                     type="filepath"
-                 )
-
-             # # Examples section
-             # gr.Examples(
-             #     examples=[
-             #         ["sample.pdf", "af_heart", "af_nicole", "openrouter", "your-api-key-here", "https://openrouter.ai/api/v1"],
-             #     ],
-             #     inputs=[pdf_input, speaker1_voice, speaker2_voice, provider, api_key, openrouter_base],
-             #     outputs=[status_output, audio_output],
-             #     fn=process_pdf,
-             #     cache_examples=True,
-             # )
-
          submit_btn.click(
-             fn=process_pdf,
-             inputs=[
-                 pdf_input,
-                 speaker1_voice,
-                 speaker2_voice,
-                 provider,
-                 api_key,
-                 openrouter_base
-             ],
-             outputs=[status_output, audio_output],
-             api_name="generate"
          )
-
-         gr.Markdown(
-             """
-             ### 📝 Notes
-             - Make sure your PDF is readable and contains text (not scanned images)
-             - Processing large PDFs may take a few minutes
-             - You need a valid OpenAI/OpenRouter API key set as environment variable
-             """
-         )
-
      return app

  if __name__ == "__main__":
-     demo = create_gradio_app()
-     demo.queue().launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=True,
-         debug=True,
-         pwa=True
-     )
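Review note: in both the removed parallel path above and the sequential replacement below, process_segment builds a fresh KPipeline for every dialogue entry, so model loading dominates runtime. A sketch of hoisting the pipeline out of the per-segment call (an alternative for a follow-up, not part of this commit; it assumes a KPipeline instance can be reused across calls, as the repeated construction here suggests):

# Sketch only, not in this commit: reuse one KPipeline across all segments.
from kokoro import KPipeline
import numpy as np

pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")

def synth_segment(dialogue, voice):
    # pipeline(...) streams chunks; the third element of each yield is audio.
    chunks = [audio for _, _, audio in pipeline(dialogue, voice=voice)]
    return np.concatenate(chunks, axis=0) if chunks else None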
 
 
  import os
  import tempfile
  import gradio as gr
  import shutil
+ import ast
+ import numpy as np
+ import soundfile as sf
  import warnings

+ try:
+     from moshi.models.tts import TTSModel
+ except ImportError:
+     print("Moshi TTSModel not available — install Kyutai’s version via pip.")
+     TTSModel = None
+
+ from notebook_lm_kokoro import (
+     generate_podcast_script,
+     generate_audio_from_script,
+     generate_audio_kyutai,
+     KPipeline,
+ )

+ warnings.filterwarnings("ignore")
+
+ def process_segment(entry, voice_map):
      speaker, dialogue = entry
      chosen_voice = voice_map.get(speaker, "af_heart")
      pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
      generator = pipeline(dialogue, voice=chosen_voice)
+     return np.concatenate([audio for _, _, audio in generator], axis=0) if generator else None

  def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
+     print("[DEBUG] Raw transcript string:")
+     print(script)

+     voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
      try:
          transcript_list = ast.literal_eval(script)
          if not isinstance(transcript_list, list):
              raise ValueError("Transcript is not a list")

+         entries = [entry for entry in transcript_list if isinstance(entry, tuple) and len(entry) == 2]
+         results = [process_segment(entry, voice_map) for entry in entries if entry is not None]

+         if not results:
              return None

          sample_rate = 24000
          pause = np.zeros(sample_rate, dtype=np.float32)
+         final_audio = results[0]
+         for seg in results[1:]:
              final_audio = np.concatenate((final_audio, pause, seg), axis=0)
          sf.write(output_file, final_audio, sample_rate)
          return output_file
      except Exception as e:
+         print(f"Transcript parse error: {e}")
          return None

+ def process_pdf(pdf_file, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                 provider, openai_key=None, openrouter_key=None, openrouter_base=None, tts_engine=None):
      try:
+         if provider == "openai" and not openai_key:
+             return "OpenAI API key is required", None
+         if provider == "openrouter" and not openrouter_key:
+             return "OpenRouter API key is required", None
+
+         if provider in ["openai", "kyutai"]:
+             os.environ["OPENAI_API_KEY"] = openai_key or ""
              os.environ["OPENROUTER_API_BASE"] = "https://api.openai.com/v1"
+         if provider in ["openrouter", "kyutai"]:
+             os.environ["OPENAI_API_KEY"] = openrouter_key or ""
              os.environ["OPENROUTER_API_BASE"] = openrouter_base or "https://openrouter.ai/api/v1"

          if pdf_file is None:
              return "No file uploaded", None

+         tmp_path = pdf_file.name

+         script_provider = "openrouter" if provider == "kyutai" and openrouter_key else provider
+         transcript, _ = generate_podcast_script(pdf_file.name, provider=script_provider)

          if transcript is None:
+             return "Transcript generation failed: got None", None
+         if not transcript.strip().startswith("["):
+             return f"Malformed transcript:\n{transcript}", None

+         audio_path = os.path.join(os.path.dirname(tmp_path), f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}")

+         if tts_engine == "kyutai":
+             result = generate_audio_kyutai(transcript, kyutai_voice1, kyutai_voice2, audio_path)
+         else:
+             result = generate_audio_from_script_with_voices(transcript, speaker1_voice, speaker2_voice, audio_path)

+         return ("Process complete!", result) if result else ("Error generating audio", None)
      except Exception as e:
+         print(f"process_pdf error: {e}")
+         return f"Error: {e}", None
+
+ def update_ui(provider, tts_engine):
+     return [
+         gr.update(visible=tts_engine == "kokoro"),
+         gr.update(visible=tts_engine == "kokoro"),
+         gr.update(visible=tts_engine == "kyutai"),
+         gr.update(visible=tts_engine == "kyutai"),
+         gr.update(visible=provider in ["openai", "kyutai"]),
+         gr.update(visible=provider in ["openrouter", "kyutai"]),
+         gr.update(visible=provider == "openrouter"),
+     ]

  def create_gradio_app():
+     css = ".gradio-container {max-width: 900px !important}"
      with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
+         gr.Markdown("# 🎧 PDF to Podcast — NotebookLM + Kokoro/Kyutai")
+
+         pdf_input = gr.File(file_types=[".pdf"], type="filepath", label="📄 Upload your PDF", scale=2)
+
          with gr.Row():
+             speaker1_voice = gr.Dropdown(["af_heart", "af_bella", "hf_beta"], value="af_heart", label="Speaker 1 Voice")
+             speaker2_voice = gr.Dropdown(["af_nicole", "af_heart", "bf_emma"], value="bf_emma", label="Speaker 2 Voice")
+             provider = gr.Radio(["openai", "openrouter"], value="openrouter", label="API Provider")
+             openai_key = gr.Textbox(type="password", label="OpenAI Key")
+             openrouter_key = gr.Textbox(type="password", label="OpenRouter Key")
+             openrouter_base = gr.Textbox(placeholder="https://openrouter.ai/api/v1", label="OpenRouter Base URL")
+             tts_engine = gr.Radio(["kokoro", "kyutai"], value="kokoro", label="TTS Engine")

+         with gr.Row():
+             kyutai_voice1 = gr.Dropdown([
+                     "expresso/ex03-ex01_happy_001_channel1_334s.wav",
+                     "expresso/ex03-ex02_narration_001_channel1_674s.wav",
+                     "vctk/p226_023_mic1.wav"
+                 ],
+                 value="expresso/ex03-ex01_happy_001_channel1_334s.wav",
+                 label="Kyutai Voice 1",
+                 visible=True)
+
+             kyutai_voice2 = gr.Dropdown([
+                     "expresso/ex03-ex01_happy_001_channel1_334s.wav",
+                     "expresso/ex03-ex02_narration_001_channel1_674s.wav",
+                     "vctk/p225_023_mic1.wav"
+                 ],
+                 value="expresso/ex03-ex02_narration_001_channel1_674s.wav",
+                 label="Kyutai Voice 2",
+                 visible=True)
+
+         submit_btn = gr.Button("🎙️ Generate Podcast", variant="primary")
+         status_output = gr.Textbox(label="📝 Status", interactive=False)
+         audio_output = gr.Audio(type="filepath", label="🎵 Your Podcast")

          submit_btn.click(
+             process_pdf,
+             inputs=[pdf_input, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                     provider, openai_key, openrouter_key, openrouter_base, tts_engine],
+             outputs=[status_output, audio_output]
          )
+
+         provider.change(update_ui, [provider, tts_engine],
+                         [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                          openai_key, openrouter_key, openrouter_base])
+         tts_engine.change(update_ui, [provider, tts_engine],
+                           [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                            openai_key, openrouter_key, openrouter_base])
+
+         gr.Markdown("""
+         **📌 Tips**
+         - Upload a clean, structured PDF.
+         - Pick your API provider and enter relevant keys.
+         - Choose the TTS engine and customize voices.
+         """)
+
      return app

  if __name__ == "__main__":
+     create_gradio_app().queue().launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, pwa=True)
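Review note: the new process_pdf takes every UI component's value directly, so it can be smoke-tested without the interface. A minimal sketch of the Kokoro path (not part of the commit; sample.pdf and the key are placeholders, and a SimpleNamespace stands in for Gradio's file wrapper since only .name is read):

# Sketch only: exercise process_pdf headlessly.
from types import SimpleNamespace
from gradio_app import process_pdf

pdf_stub = SimpleNamespace(name="sample.pdf")  # placeholder PDF path
status, audio_path = process_pdf(
    pdf_stub,
    "af_heart", "bf_emma",   # Kokoro voices for Speaker 1 / Speaker 2
    None, None,              # Kyutai voices are unused when tts_engine="kokoro"
    provider="openrouter",
    openrouter_key="sk-or-placeholder",  # placeholder key
    tts_engine="kokoro",
)
print(status, audio_path)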
 
 
 
 
 
 
 
notebook_lm_kokoro.py CHANGED
@@ -12,17 +12,23 @@ If using OpenRouter, you can also set:
  """

  from kokoro import KPipeline
- from IPython.display import Audio  # Only needed if displaying in a notebook
  import soundfile as sf
  import PyPDF2
  import numpy as np
  import openai
  import os
  import shutil
- import asyncio
  import ast
  import json
  import warnings
  warnings.filterwarnings("ignore")

  # Set your OpenAI (or OpenRouter) API key from the environment
@@ -30,8 +36,6 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
  # For OpenRouter compatibility, set the API base if provided.
  openai.api_base = os.getenv("OPENROUTER_API_BASE", "https://api.openai.com/v1")

- pdf = "1706.03762v7.pdf"
-
  def pdf_to_prompted_text(pdf_path):
      """
@@ -134,7 +138,7 @@ def generate_audio_from_script(script, output_file="podcast_audio.wav"):
      # Clean up the script string if needed
      script = script.strip()
      if not script.startswith("[") or not script.endswith("]"):
-         print("Invalid transcript format. Expected a list of tuples.")
          return

      try:
@@ -147,45 +151,102 @@ def generate_audio_from_script(script, output_file="podcast_audio.wav"):
          # Process each dialogue entry
          for i, entry in enumerate(transcript_list):
              if not isinstance(entry, tuple) or len(entry) != 2:
-                 print(f"Skipping invalid entry {i}: {entry}")
                  continue

              speaker, dialogue = entry
              chosen_voice = voice_map.get(speaker, "af_heart")
-             print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")

-             pipeline = KPipeline(lang_code="a")
              generator = pipeline(dialogue, voice=chosen_voice)

-             segment_audio = []
-             for j, (gs, ps, audio) in enumerate(generator):
-                 # print(
-                 #     f"{speaker} - Segment {j}: Global Step = {gs}, Partial Step = {ps}"
-                 # )
-                 segment_audio.append(audio)
-
              if segment_audio:
-                 segment_full = np.concatenate(segment_audio, axis=0)
-                 all_audio_segments.append(segment_full)

          if not all_audio_segments:
-             print("No audio segments were generated.")
              return

          # Add a pause between segments
          sample_rate = 24000
          pause = np.zeros(sample_rate, dtype=np.float32)
-         final_audio = all_audio_segments[0]
-         for seg in all_audio_segments[1:]:
-             final_audio = np.concatenate((final_audio, pause, seg), axis=0)
-
          sf.write(output_file, final_audio, sample_rate)
-         print(f"Saved final audio as {output_file}")

      except Exception as e:
-         print(f"Error processing transcript: {e}")
          return

  def generate_tts():
      pipeline = KPipeline(lang_code="a")
@@ -222,25 +283,23 @@ def generate_podcast_script(
      Set provider="openrouter" to use OpenRouter, otherwise uses OpenAI.
      """
      pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
-
-     # Use /tmp if writable, else fallback to current working directory
-     base_dir = "/tmp" if os.access("/tmp", os.W_OK) else os.getcwd()
-     folder = os.path.join(base_dir, pdf_basename)
      os.makedirs(folder, exist_ok=True)

      destination_pdf = os.path.join(folder, os.path.basename(pdf_path))
-     if not os.path.exists(destination_pdf):
          shutil.copy(pdf_path, destination_pdf)
-         print(f"Copied {pdf_path} to {destination_pdf}")
-     else:
-         print(f"PDF already copied at {destination_pdf}")

      transcript_path = os.path.join(folder, output_file)
      # If transcript exists, load and return it without calling the API.
      if os.path.exists(transcript_path):
          with open(transcript_path, "r") as f:
              transcript = f.read()
-         print(f"Transcript loaded from {transcript_path}")
          return transcript, transcript_path

      # Otherwise, generate the transcript.
@@ -265,15 +324,15 @@ def generate_podcast_script(
      if provider == "openrouter":
          api_key = os.getenv("OPENAI_API_KEY")
          base_url = os.getenv("OPENROUTER_API_BASE", "https://openrouter.ai/api/v1")
-         print("Using OpenRouter API endpoint.")
      else:
          api_key = os.getenv("OPENAI_API_KEY")
          base_url = "https://api.openai.com/v1"
-         print("Using OpenAI API endpoint.")

      client = openai.OpenAI(api_key=api_key, base_url=base_url)

-     print(f"Sending request to {base_url} to generate a podcast script...")
      response = client.chat.completions.create(
          model="gpt-4o-mini",
          messages=messages,
@@ -298,10 +357,10 @@ def generate_podcast_script(
      transcript_list = []
      for i, entry in enumerate(dialogue):
          if not isinstance(entry, list) or len(entry) != 2:
-             print(f"Skipping invalid dialogue entry {i}: {entry}")
              continue
          if entry[0] not in ["Speaker 1", "Speaker 2"]:
-             print(f"Invalid speaker label in entry {i}: {entry[0]}")
              continue
          transcript_list.append(tuple(entry))
@@ -312,31 +371,26 @@ def generate_podcast_script(
          script = str(transcript_list)

      except json.JSONDecodeError as e:
-         print(f"Error: Invalid JSON response from API: {e}")
-         print(f"Raw response: {response.choices[0].message.content}")
          return None, None
      except Exception as e:
-         print(f"Error processing response: {e}")
          return None, None

      # Save the transcript
      with open(transcript_path, "w") as f:
          f.write(script)
-     print(f"Saved podcast script as {transcript_path}")

      return script, transcript_path

- async def _generate_script_async(messages):
-     response = await openai.ChatCompletion.acreate(
-         model="gpt-4o-mini", messages=messages, temperature=0.7, max_tokens=20000
-     )
-     return response["choices"][0]["message"]["content"]
-
  if __name__ == "__main__":
-     # For example, to generate a podcast script from the PDF using OpenRouter or OpenAI:
      transcript, transcript_path = generate_podcast_script(pdf, provider="openrouter")
-     # Use the transcript to generate and save the audio. The output file is stored in the same folder.
-     audio_output = transcript_path.replace(".txt", ".wav")
-     generate_audio_from_script(transcript, output_file=audio_output)
 
  """

  from kokoro import KPipeline
  import soundfile as sf
  import PyPDF2
  import numpy as np
  import openai
  import os
  import shutil
  import ast
  import json
  import warnings
+ import torch
+ import time
+ try:
+     from moshi.models.loaders import CheckpointInfo
+     from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel
+ except ImportError:
+     CheckpointInfo = None
+     TTSModel = None
  warnings.filterwarnings("ignore")

  # Set your OpenAI (or OpenRouter) API key from the environment
  # For OpenRouter compatibility, set the API base if provided.
  openai.api_base = os.getenv("OPENROUTER_API_BASE", "https://api.openai.com/v1")

  def pdf_to_prompted_text(pdf_path):
      """

      # Clean up the script string if needed
      script = script.strip()
      if not script.startswith("[") or not script.endswith("]"):
+         print("[ERROR] Invalid transcript format. Expected a list of tuples.")
          return

      try:

          # Process each dialogue entry
          for i, entry in enumerate(transcript_list):
              if not isinstance(entry, tuple) or len(entry) != 2:
+                 print(f"[WARNING] Skipping invalid entry {i}: {entry}")
                  continue

              speaker, dialogue = entry
              chosen_voice = voice_map.get(speaker, "af_heart")
+             print(f"[INFO] Generating audio for {speaker} with voice '{chosen_voice}'...")

+             pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
              generator = pipeline(dialogue, voice=chosen_voice)

+             segment_audio = [audio for _, _, audio in generator]
              if segment_audio:
+                 all_audio_segments.append(np.concatenate(segment_audio, axis=0))

          if not all_audio_segments:
+             print("[ERROR] No audio segments were generated.")
              return

          # Add a pause between segments
          sample_rate = 24000
          pause = np.zeros(sample_rate, dtype=np.float32)
+         final_audio = np.concatenate(
+             [seg if i == 0 else np.concatenate((pause, seg), axis=0)
+              for i, seg in enumerate(all_audio_segments)],
+             axis=0
+         )
          sf.write(output_file, final_audio, sample_rate)
+         print(f"[INFO] Saved final audio as {output_file}")

      except Exception as e:
+         import traceback
+         print(f"[ERROR] Exception while parsing transcript or generating audio: {e}")
+         traceback.print_exc()
          return

+ def generate_audio_kyutai(script, speaker1_voice=None, speaker2_voice=None, output_file="kyutai_audio.wav"):
+     if TTSModel is None:
+         print("Moshi is not installed.")
+         return None
+
+     try:
+         print(f"[INFO] Requested Kyutai voices: {speaker1_voice=}, {speaker2_voice=}")
+         # Reject absolute/local paths
+         if os.path.isabs(speaker1_voice) or os.path.isfile(speaker1_voice):
+             raise ValueError(f"❌ Invalid voice path for speaker1: {speaker1_voice}")
+         if os.path.isabs(speaker2_voice) or os.path.isfile(speaker2_voice):
+             raise ValueError(f"❌ Invalid voice path for speaker2: {speaker2_voice}")
+
+         transcript_list = ast.literal_eval(script)
+
+         # Load TTS model
+         checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
+         tts_model = TTSModel.from_checkpoint_info(
+             checkpoint_info,
+             n_q=32,
+             temp=0.6,
+             device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         )
+
+         # Use voice names directly from dropdown
+         print("[INFO] Resolving voice paths...")
+
+         start = time.time()
+         voice1_path = tts_model.get_voice_path(speaker1_voice)
+         print(f"[INFO] Got voice1_path in {time.time() - start:.2f}s")
+
+         start = time.time()
+         voice2_path = tts_model.get_voice_path(speaker2_voice)
+         print(f"[INFO] Got voice2_path in {time.time() - start:.2f}s")
+
+         texts = [dialogue for _, dialogue in transcript_list]
+         entries = tts_model.prepare_script(texts, padding_between=1)
+
+         condition_attributes = tts_model.make_condition_attributes([voice1_path, voice2_path], cfg_coef=2.0)
+
+         pcms = []
+         def _on_frame(frame):
+             if (frame != -1).all():
+                 pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
+                 pcms.append(np.clip(pcm[0, 0], -1, 1))
+
+         with tts_model.mimi.streaming(1):
+             tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)
+
+         if pcms:
+             audio = np.concatenate(pcms, axis=-1)
+             sf.write(output_file, audio, tts_model.mimi.sample_rate)
+             print(f"[SUCCESS] Audio saved to: {output_file}")
+             return output_file
+
+         print("[WARNING] No audio segments were produced.")
+         return None
+
+     except Exception as e:
+         print(f"[ERROR] Kyutai TTS error: {e}")
+         return None

  def generate_tts():
      pipeline = KPipeline(lang_code="a")

      Set provider="openrouter" to use OpenRouter, otherwise uses OpenAI.
      """
      pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
+     folder = os.path.join("/tmp", pdf_basename)
      os.makedirs(folder, exist_ok=True)

      destination_pdf = os.path.join(folder, os.path.basename(pdf_path))
+     try:
          shutil.copy(pdf_path, destination_pdf)
+         print(f"[INFO] Copied {pdf_path} to {destination_pdf}")
+     except PermissionError:
+         print(f"[WARNING] Cannot copy PDF to {destination_pdf}, using original path.")
+         destination_pdf = pdf_path  # fallback

      transcript_path = os.path.join(folder, output_file)
      # If transcript exists, load and return it without calling the API.
      if os.path.exists(transcript_path):
          with open(transcript_path, "r") as f:
              transcript = f.read()
+         print(f"[INFO] Transcript loaded from {transcript_path}")
          return transcript, transcript_path

      # Otherwise, generate the transcript.

      if provider == "openrouter":
          api_key = os.getenv("OPENAI_API_KEY")
          base_url = os.getenv("OPENROUTER_API_BASE", "https://openrouter.ai/api/v1")
+         print("[INFO] Using OpenRouter API endpoint.")
      else:
          api_key = os.getenv("OPENAI_API_KEY")
          base_url = "https://api.openai.com/v1"
+         print("[INFO] Using OpenAI API endpoint.")

      client = openai.OpenAI(api_key=api_key, base_url=base_url)

+     print(f"[INFO] Sending request to {base_url} to generate a podcast script...")
      response = client.chat.completions.create(
          model="gpt-4o-mini",
          messages=messages,

      transcript_list = []
      for i, entry in enumerate(dialogue):
          if not isinstance(entry, list) or len(entry) != 2:
+             print(f"[WARNING] Skipping invalid dialogue entry {i}: {entry}")
              continue
          if entry[0] not in ["Speaker 1", "Speaker 2"]:
+             print(f"[WARNING] Invalid speaker label in entry {i}: {entry[0]}")
              continue
          transcript_list.append(tuple(entry))

          script = str(transcript_list)

      except json.JSONDecodeError as e:
+         print(f"[ERROR] Invalid JSON response from API: {e}")
+         print(f"[ERROR] Raw response: {response.choices[0].message.content}")
          return None, None
      except Exception as e:
+         print(f"[ERROR] Error processing response: {e}")
          return None, None

      # Save the transcript
      with open(transcript_path, "w") as f:
          f.write(script)
+     print(f"[INFO] Saved podcast script as {transcript_path}")

      return script, transcript_path

+ # Minimal test harness
  if __name__ == "__main__":
+     pdf = "1706.03762v7.pdf"
      transcript, transcript_path = generate_podcast_script(pdf, provider="openrouter")
+     if transcript and transcript_path:
+         audio_output = transcript_path.replace(".txt", ".wav")
+         generate_audio_from_script(transcript, output_file=audio_output)
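Review note: a matching sketch for the new Kyutai path (not part of the commit; it assumes moshi is installed and reuses the voice names offered in the Gradio dropdowns, with the transcript coming from generate_podcast_script as above):

# Sketch only: drive generate_audio_kyutai directly.
from notebook_lm_kokoro import generate_podcast_script, generate_audio_kyutai

transcript, transcript_path = generate_podcast_script("1706.03762v7.pdf", provider="openrouter")
if transcript:
    generate_audio_kyutai(
        transcript,
        speaker1_voice="expresso/ex03-ex01_happy_001_channel1_334s.wav",
        speaker2_voice="expresso/ex03-ex02_narration_001_channel1_674s.wav",
        output_file=transcript_path.replace(".txt", "_kyutai.wav"),
    )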