udayl committed
Commit 3ac5c08 · 1 Parent(s): 071f6c6
Files changed (4)
  1. README.md +60 -14
  2. gradio_app.py +156 -0
  3. notebook_lm_kokoro.py +339 -0
  4. pyproject.toml +15 -0
README.md CHANGED
@@ -1,14 +1,60 @@
- ---
- title: NotebookLM-Kokoro TTS App
- emoji: 💻
- colorFrom: green
- colorTo: pink
- sdk: gradio
- sdk_version: 5.35.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: NotebookLM-Kokoro TTS App
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # NotebookLM-Kokoro TTS Project
+
+ This project uses [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) – a lightweight, open-weight TTS model with 82 million parameters – to build a Google NotebookLM-style text-to-speech application.
+
+ ## Why Kokoro?
+
+ - **Non-Proprietary & Open-Source:** Kokoro's Apache-licensed open weights give you full flexibility to deploy it in production environments or personal projects.
+ - **High Efficiency:** Despite its lightweight architecture, Kokoro delivers quality comparable to larger models while being faster and more cost-efficient.
+ - **Benchmarks:** According to benchmarks on the [TTS-Arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena) leaderboard, Kokoro outperforms many closed-source models, making it a strong choice for open deployments.
+ - **Easy Integration:** Kokoro installs via pip (with Homebrew covering system dependencies like espeak-ng), so integration into Python projects is straightforward, as the sketch below shows.
+
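+ For a first impression, here is a minimal sketch of the Kokoro API this project builds on (the same `KPipeline` calls used in `notebook_lm_kokoro.py`; the voice and language codes are the project's defaults):
+
+ ```python
+ from kokoro import KPipeline
+ import numpy as np
+ import soundfile as sf
+
+ pipeline = KPipeline(lang_code="a")  # "a" = American English
+ # The pipeline yields (graphemes, phonemes, audio) tuples per segment.
+ segments = [audio for _, _, audio in pipeline("Hello from Kokoro!", voice="af_heart")]
+ sf.write("hello.wav", np.concatenate(segments, axis=0), 24000)  # Kokoro outputs 24 kHz audio
+ ```
+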
+ ## Setup Instructions
+
+ ### Environment Setup
+
+ This project uses the **uv** Python package manager. Follow these steps:
+
+ 1. **Install uv:**
+
+ ```bash
+ pip install uv
+ ```
+
+ 2. **Create a new virtual environment** (uv creates it in `.venv`):
+
+ ```bash
+ uv venv
+ ```
+
+ 3. **Activate the environment:**
+
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ 4. **Install Python dependencies:**
+
+ ```bash
+ pip install "kokoro>=0.9.2" soundfile torch
+ ```
+
+ 5. **Install espeak-ng (macOS users):**
+
+ ```bash
+ brew install espeak-ng
+ ```
+
+ ### Running the Application
+
+ Once the environment is set up, run the main TTS script as follows:
+
+ ```bash
+ python notebook_lm_kokoro.py
+ ```
+
+ This will process the transcript text using Kokoro and write the concatenated audio out as WAV files.
+
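+ You can also drive the pipeline from your own code. The script's two main entry points can be called directly, mirroring its `__main__` block (a sketch; `paper.pdf` stands in for your PDF path):
+
+ ```python
+ from notebook_lm_kokoro import generate_podcast_script, generate_audio_from_script
+
+ # Generates (or loads a cached) transcript, then renders it to audio.
+ transcript, transcript_path = generate_podcast_script("paper.pdf", provider="openrouter")
+ generate_audio_from_script(transcript, output_file=transcript_path.replace(".txt", ".wav"))
+ ```
+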
+ ## Conclusion
+
+ Kokoro’s combination of efficiency, quality, and open access makes it one of the best non-proprietary TTS models available, as the TTS-Arena benchmarks suggest. Enjoy exploring and extending this project!
gradio_app.py ADDED
@@ -0,0 +1,156 @@
+ # filepath: /Users/udaylunawat/Downloads/Data-Science-Projects/NotebookLM_clone/gradio_app.py
+ import os
+ import tempfile
+ import gradio as gr
+ from notebook_lm_kokoro import generate_podcast_script, KPipeline
+ import soundfile as sf
+ import numpy as np
+ import ast
+ import shutil
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+
+ # A modified version of generate_audio_from_script that accepts a voice mapping
+ def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
+     voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
+
+     # Clean up the script string if needed
+     script = script.strip()
+     if not script.startswith("[") or not script.endswith("]"):
+         print("Invalid transcript format. Expected a list of tuples.")
+         return None
+
+     try:
+         transcript_list = ast.literal_eval(script)
+         if not isinstance(transcript_list, list):
+             raise ValueError("Transcript is not a list")
+
+         all_audio_segments = []
+         # Process each dialogue entry
+         for i, entry in enumerate(transcript_list):
+             if not isinstance(entry, tuple) or len(entry) != 2:
+                 print(f"Skipping invalid entry {i}: {entry}")
+                 continue
+
+             speaker, dialogue = entry
+             chosen_voice = voice_map.get(speaker, "af_heart")
+             print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")
+
+             # Note: building a fresh pipeline per entry is simple but wasteful;
+             # it could be hoisted outside the loop and reused.
+             pipeline = KPipeline(lang_code="a")
+             generator = pipeline(dialogue, voice=chosen_voice)
+
+             segment_audio = []
+             for j, (gs, ps, audio) in enumerate(generator):
+                 # print(f"{speaker} - Segment {j}: Global Step = {gs}, Partial Step = {ps}")
+                 segment_audio.append(audio)
+
+             if segment_audio:
+                 segment_full = np.concatenate(segment_audio, axis=0)
+                 all_audio_segments.append(segment_full)
+
+         if not all_audio_segments:
+             print("No audio segments were generated.")
+             return None
+
+         # Add a one-second pause between segments (24000 zero samples at 24 kHz)
+         sample_rate = 24000
+         pause = np.zeros(sample_rate, dtype=np.float32)
+         final_audio = all_audio_segments[0]
+         for seg in all_audio_segments[1:]:
+             final_audio = np.concatenate((final_audio, pause, seg), axis=0)
+
+         sf.write(output_file, final_audio, sample_rate)
+         print(f"Saved final audio as {output_file}")
+         return output_file
+
+     except Exception as e:
+         print(f"Error processing transcript: {e}")
+         return None
+
+
+ def process_pdf(pdf_file, speaker1_voice, speaker2_voice, provider):
+     """Process the uploaded PDF file and generate audio."""
+     try:
+         # Check if we received a valid file
+         if pdf_file is None:
+             return "No file uploaded", None
+
+         # Create a temporary file with a .pdf extension
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+             # For Gradio uploads, we need to copy the file
+             shutil.copy2(pdf_file.name, tmp.name)
+             tmp_path = tmp.name
+
+         print(f"Uploaded PDF saved at {tmp_path}")
+
+         # Generate the transcript
+         transcript, transcript_path = generate_podcast_script(tmp_path, provider=provider)
+         if transcript is None:
+             return "Error generating transcript", None
+
+         # Define an output file path for the generated audio
+         audio_output_path = os.path.join(
+             os.path.dirname(tmp_path),
+             f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}",
+         )
+
+         result = generate_audio_from_script_with_voices(
+             transcript,
+             speaker1_voice,
+             speaker2_voice,
+             output_file=audio_output_path,
+         )
+
+         if result is None:
+             return "Error generating audio", None
+
+         return "Process complete!", result
+
+     except Exception as e:
+         print(f"Error in process_pdf: {e}")
+         return f"Error processing file: {e}", None
+
+
+ def create_gradio_app():
+     with gr.Blocks() as app:
+         gr.Markdown("# NotebookLM-Kokoro TTS App")
+         gr.Markdown("Upload a PDF, choose voices, and generate TTS audio using Kokoro.")
+
+         with gr.Row():
+             pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+
+         with gr.Row():
+             with gr.Column():
+                 speaker1_voice = gr.Dropdown(
+                     choices=["af_heart", "af_bella", "hf_beta"],
+                     value="af_heart",
+                     label="Speaker 1 Voice",
+                 )
+                 speaker2_voice = gr.Dropdown(
+                     choices=["af_nicole", "af_heart", "bf_emma"],
+                     value="af_nicole",
+                     label="Speaker 2 Voice",
+                 )
+                 provider = gr.Radio(
+                     choices=["openai", "openrouter"],
+                     value="openrouter",
+                     label="API Provider (TTS Script Generation)",
+                 )
+                 submit_btn = gr.Button("Generate Audio")
+
+         with gr.Row():
+             status_output = gr.Textbox(label="Status")
+             audio_output = gr.Audio(label="Generated Audio", type="filepath")
+
+         submit_btn.click(
+             fn=process_pdf,
+             inputs=[pdf_input, speaker1_voice, speaker2_voice, provider],
+             outputs=[status_output, audio_output],
+         )
+
+     return app
+
+
+ if __name__ == "__main__":
+     demo = create_gradio_app()
+     demo.launch(share=True)  # share=True exposes a temporary public URL
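+
+ # Usage note (a sketch, assuming the environment from the README is active):
+ #     python gradio_app.py
+ # Gradio prints a local URL; share=True additionally creates a temporary public
+ # URL. Drop the flag (or pass share=False) for local-only serving.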
notebook_lm_kokoro.py ADDED
@@ -0,0 +1,339 @@
+ """
+ Before running this script, ensure you have installed the dependencies:
+     pip install "kokoro>=0.9.2" soundfile torch PyPDF2 numpy openai
+ (The quotes around kokoro>=0.9.2 keep the shell from treating >= as a redirect.)
+ Also, if needed, install espeak-ng (on macOS, via Homebrew):
+     brew install espeak-ng
+
+ Set your OpenAI (or OpenRouter) API key as an environment variable:
+     export OPENAI_API_KEY="your_api_key"
+
+ If using OpenRouter, you can also set:
+     export OPENROUTER_API_BASE="https://openrouter.ai/api/v1"
+ """
+
+ from kokoro import KPipeline
+ from IPython.display import Audio  # Only needed if displaying in a notebook
+ import soundfile as sf
+ import PyPDF2
+ import numpy as np
+ import openai
+ import os
+ import shutil
+ import asyncio
+ import ast
+ import json
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ # Default credentials for the openai SDK. generate_podcast_script() constructs
+ # an explicit client with its own base_url, so these module-level values act as
+ # fallbacks. (The v1 SDK reads `base_url`; the legacy `api_base` is ignored.)
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ openai.base_url = os.getenv("OPENROUTER_API_BASE", "https://api.openai.com/v1")
+
+ pdf = "1706.03762v7.pdf"
+
+
+ def pdf_to_prompted_text(pdf_path):
+     """
+     Reads a PDF file and returns its text, wrapped with the system prompts.
+     """
+     with open(pdf_path, "rb") as f:
+         reader = PyPDF2.PdfReader(f)
+         pdf_text = ""
+         for page in reader.pages:
+             pdf_text += page.extract_text() or ""
+
+     prompted_text = f"""
+ Transcript Writer System Prompt:
+ {TRANSCRIPT_WRITER_SYSTEM_PROMPT}
+
+ Transcript Rewriter System Prompt:
+ {TRANSCRIPT_REWRITER_SYSTEM_PROMPT}
+
+ PDF Content:
+ {pdf_text}
+ """
+     return prompted_text
+
+
+ # System prompt constants
+ TRANSCRIPT_WRITER_SYSTEM_PROMPT = """
+ You are a world-class storyteller and you have worked as a ghostwriter.
+ Welcome the listeners by talking about the Chapter Title.
+ You will be talking to a guest.
+
+ Do not address the other speaker as Speaker 1 or Speaker 2.
+
+ Instructions for Speaker 1:
+ Speaker 1: Leads the conversation and teaches the guest, giving incredible anecdotes and analogies when explaining. A captivating teacher with great anecdotes.
+ Speaker 1: Do not address the guest as Speaker 2.
+ Remember the guest is new to the topic, and the conversation should always feature realistic anecdotes and analogies with real-world example follow-ups.
+
+ Instructions for Speaker 2:
+ Speaker 2: Keeps the conversation on track by asking follow-up questions. Gets super excited or confused when asking questions. A curious mindset that asks very interesting confirmation questions.
+ Speaker 2: Do not address the other speaker as Speaker 1.
+ Make sure the tangents provided are quite wild or interesting.
+
+ ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1.
+ IT SHOULD STRICTLY BE THE DIALOGUES.
+ """
+
+ TRANSCRIPT_REWRITER_SYSTEM_PROMPT = """
+ You are an international Oscar-winning screenwriter and you have worked with multiple award-winning teams.
+
+ Your job is to use the transcript written below to rewrite it for an AI Text-To-Speech Pipeline.
+ A very dumb AI had written this so you have to step up for your kind.
+
+ Make it as engaging as possible; Speaker 1 and the guest will be simulated by different voice engines.
+ Remember the guest is new to the topic and the conversation should always include realistic anecdotes and analogies, with real-world example follow-ups.
+ Ensure the guest's contributions include wild or interesting tangents and occasional interruptions ("hmm", "umm", etc.).
+
+ It should be a real story with every nuance documented in detail.
+
+ IMPORTANT FORMAT INSTRUCTIONS:
+ You must return a JSON array of arrays, where each inner array contains exactly two strings:
+ 1. The speaker label (either "Speaker 1" or "Speaker 2")
+ 2. The dialogue text
+
+ Example format:
+ [
+     ["Speaker 1", "Welcome everyone..."],
+     ["Speaker 2", "Thanks for having me..."],
+     ["Speaker 1", "Let me explain..."]
+ ]
+
+ YOUR RESPONSE MUST BE VALID JSON.
+ NO OTHER TEXT BEFORE OR AFTER THE JSON ARRAY.
+ """
+
+
+ def generate_tts_from_pdf(pdf_path, output_file="final_output.wav"):
+     pipeline = KPipeline(lang_code="a")
+     text = pdf_to_prompted_text(pdf_path)
+     generator = pipeline(text, voice="af_heart")
+
+     audio_segments = []
+     for i, (gs, ps, audio) in enumerate(generator):
+         print(f"Segment {i}: Global Step = {gs}, Partial Step = {ps}")
+         audio_segments.append(audio)
+         print(f"Collected audio segment {i}")
+
+     # Concatenate all audio segments into a single array and write one WAV file.
+     final_audio = np.concatenate(audio_segments, axis=0)
+     sf.write(output_file, final_audio, 24000)
+     print(f"Saved final audio as {output_file}")
+
+
+ def generate_audio_from_script(script, output_file="podcast_audio.wav"):
+     """
+     Uses Kokoro TTS to generate audio from the provided transcript.
+     Expects a transcript in the format of a list of tuples:
+     [("Speaker 1", "dialogue"), ("Speaker 2", "dialogue"), ...]
+     """
+     voice_map = {"Speaker 1": "af_heart", "Speaker 2": "af_nicole"}
+
+     # Clean up the script string if needed
+     script = script.strip()
+     if not script.startswith("[") or not script.endswith("]"):
+         print("Invalid transcript format. Expected a list of tuples.")
+         return
+
+     try:
+         # Parse the transcript
+         transcript_list = ast.literal_eval(script)
+         if not isinstance(transcript_list, list):
+             raise ValueError("Transcript is not a list")
+
+         all_audio_segments = []
+         # Process each dialogue entry
+         for i, entry in enumerate(transcript_list):
+             if not isinstance(entry, tuple) or len(entry) != 2:
+                 print(f"Skipping invalid entry {i}: {entry}")
+                 continue
+
+             speaker, dialogue = entry
+             chosen_voice = voice_map.get(speaker, "af_heart")
+             print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")
+
+             pipeline = KPipeline(lang_code="a")
+             generator = pipeline(dialogue, voice=chosen_voice)
+
+             segment_audio = []
+             for j, (gs, ps, audio) in enumerate(generator):
+                 # print(f"{speaker} - Segment {j}: Global Step = {gs}, Partial Step = {ps}")
+                 segment_audio.append(audio)
+
+             if segment_audio:
+                 segment_full = np.concatenate(segment_audio, axis=0)
+                 all_audio_segments.append(segment_full)
+
+         if not all_audio_segments:
+             print("No audio segments were generated.")
+             return
+
+         # Add a one-second pause between segments
+         sample_rate = 24000
+         pause = np.zeros(sample_rate, dtype=np.float32)
+         final_audio = all_audio_segments[0]
+         for seg in all_audio_segments[1:]:
+             final_audio = np.concatenate((final_audio, pause, seg), axis=0)
+
+         sf.write(output_file, final_audio, sample_rate)
+         print(f"Saved final audio as {output_file}")
+
+     except Exception as e:
+         print(f"Error processing transcript: {e}")
+         return
+
+
+ def generate_tts():
+     pipeline = KPipeline(lang_code="a")
+     text = f"""
+ [Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models and is significantly faster and more cost-efficient.
+ With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
+
+ Transcript Writer System Prompt:
+ {TRANSCRIPT_WRITER_SYSTEM_PROMPT}
+
+ Transcript Rewriter System Prompt:
+ {TRANSCRIPT_REWRITER_SYSTEM_PROMPT}
+ """
+
+     generator = pipeline(text, voice="af_heart")
+     audio_segments = []
+     for i, (gs, ps, audio) in enumerate(generator):
+         print(f"Segment {i}: Global Step = {gs}, Partial Step = {ps}")
+         audio_segments.append(audio)
+         print(f"Collected audio segment {i}")
+
+     final_audio = np.concatenate(audio_segments, axis=0)
+     sf.write("final_output.wav", final_audio, 24000)
+     print("Saved final audio as final_output.wav")
+
+
+ def generate_podcast_script(
+     pdf_path, output_file="podcast_script.txt", provider="openai"
+ ):
+     """
+     Reads the PDF, wraps it with the system prompts, and then uses the chat
+     completions API (OpenAI or OpenRouter) to rewrite the PDF content as a
+     podcast-style script using "gpt-4o-mini". The generated transcript is stored
+     in a folder (named after the PDF file) along with a copy of the PDF.
+     Set provider="openrouter" to use OpenRouter; otherwise OpenAI is used.
+     """
+     pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
+     folder = os.path.join(os.getcwd(), pdf_basename)
+     os.makedirs(folder, exist_ok=True)
+
+     destination_pdf = os.path.join(folder, os.path.basename(pdf_path))
+     if not os.path.exists(destination_pdf):
+         shutil.copy(pdf_path, destination_pdf)
+         print(f"Copied {pdf_path} to {destination_pdf}")
+     else:
+         print(f"PDF already copied at {destination_pdf}")
+
+     transcript_path = os.path.join(folder, output_file)
+     # If a transcript exists, load and return it without calling the API.
+     if os.path.exists(transcript_path):
+         with open(transcript_path, "r") as f:
+             transcript = f.read()
+         print(f"Transcript loaded from {transcript_path}")
+         return transcript, transcript_path
+
+     # Otherwise, generate the transcript.
+     text = pdf_to_prompted_text(pdf_path)
+
+     messages = [
+         {"role": "system", "content": TRANSCRIPT_REWRITER_SYSTEM_PROMPT},
+         {
+             "role": "user",
+             "content": (
+                 "Convert the following text into a dialogue between two speakers.\n\n"
+                 "REQUIREMENTS:\n"
+                 "1. Return ONLY a JSON object with a single key 'dialogue' containing an array of arrays\n"
+                 "2. Each inner array must have exactly 2 elements: speaker label and dialogue text\n"
+                 "3. Speaker labels must be either 'Speaker 1' or 'Speaker 2'\n"
+                 "4. The conversation should be engaging and include analogies\n\n"
+                 "TEXT TO CONVERT:\n" + text
+             ),
+         },
+     ]
+
+     if provider == "openrouter":
+         # Note: the OpenRouter key is also read from OPENAI_API_KEY.
+         api_key = os.getenv("OPENAI_API_KEY")
+         base_url = os.getenv("OPENROUTER_API_BASE", "https://openrouter.ai/api/v1")
+         print("Using OpenRouter API endpoint.")
+     else:
+         api_key = os.getenv("OPENAI_API_KEY")
+         base_url = "https://api.openai.com/v1"
+         print("Using OpenAI API endpoint.")
+
+     client = openai.OpenAI(api_key=api_key, base_url=base_url)
+
+     print(f"Sending request to {base_url} to generate a podcast script...")
+     response = client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=messages,
+         temperature=0.7,
+         max_tokens=16000,  # gpt-4o-mini rejects completion budgets much above 16k tokens
+         response_format={"type": "json_object"},  # Force a JSON response
+     )
+
+     try:
+         # Parse the JSON response
+         content = json.loads(response.choices[0].message.content)
+
+         # Validate the JSON structure
+         if not isinstance(content, dict) or "dialogue" not in content:
+             raise ValueError("Response missing 'dialogue' key")
+
+         dialogue = content["dialogue"]
+         if not isinstance(dialogue, list):
+             raise ValueError("Dialogue must be an array")
+
+         # Validate and convert each dialogue entry
+         transcript_list = []
+         for i, entry in enumerate(dialogue):
+             if not isinstance(entry, list) or len(entry) != 2:
+                 print(f"Skipping invalid dialogue entry {i}: {entry}")
+                 continue
+             if entry[0] not in ["Speaker 1", "Speaker 2"]:
+                 print(f"Invalid speaker label in entry {i}: {entry[0]}")
+                 continue
+             transcript_list.append(tuple(entry))
+
+         if not transcript_list:
+             raise ValueError("No valid dialogue entries found")
+
+         # Convert to string format for storage
+         script = str(transcript_list)
+
+     except json.JSONDecodeError as e:
+         print(f"Error: Invalid JSON response from API: {e}")
+         print(f"Raw response: {response.choices[0].message.content}")
+         return None, None
+     except Exception as e:
+         print(f"Error processing response: {e}")
+         return None, None
+
+     # Save the transcript
+     with open(transcript_path, "w") as f:
+         f.write(script)
+     print(f"Saved podcast script as {transcript_path}")
+
+     return script, transcript_path
+
+
+ async def _generate_script_async(messages):
+     # Async variant of the script-generation call, updated to the v1 client API;
+     # the legacy openai.ChatCompletion.acreate was removed in openai>=1.0.
+     client = openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+     response = await client.chat.completions.create(
+         model="gpt-4o-mini", messages=messages, temperature=0.7, max_tokens=16000
+     )
+     return response.choices[0].message.content
+
+
+ if __name__ == "__main__":
+     # For example, generate a podcast script from the PDF using OpenRouter or OpenAI:
+     transcript, transcript_path = generate_podcast_script(pdf, provider="openrouter")
+     if transcript is not None:
+         # Use the transcript to generate and save the audio in the same folder.
+         audio_output = transcript_path.replace(".txt", ".wav")
+         generate_audio_from_script(transcript, output_file=audio_output)
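+
+ # Expected artifacts from a successful run (a sketch of the flow above): a folder
+ # named after the PDF, e.g. ./1706.03762v7/, containing a copy of the PDF,
+ # podcast_script.txt, and podcast_script.wav.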
pyproject.toml ADDED
@@ -0,0 +1,15 @@
+ [project]
+ name = "notebooklm"
+ version = "0.1.0"
+ description = "NotebookLM-style podcast audio generation using Kokoro TTS"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "kokoro>=0.9.2",
+     "soundfile",
+     "torch",
+     "PyPDF2",
+     "numpy",
+     "openai",
+     "ipython",
+ ]
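+
+ # Install sketch (assuming uv from the README): from the repository root,
+ # `uv pip install -e .` installs the project and the dependencies listed above
+ # into the active virtual environment.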