init

- README.md +60 -14
- gradio_app.py +156 -0
- notebook_lm_kokoro.py +339 -0
- pyproject.toml +15 -0
README.md
CHANGED
@@ -1,14 +1,60 @@
# NotebookLM-Kokoro TTS Project

This project uses [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) – a lightweight, open-weight TTS model with 82 million parameters – to create a Google NotebookLM style text-to-speech application.

## Why Kokoro?

- **Non-Proprietary & Open-Source:** Kokoro's open weights give you full flexibility to deploy it in production environments or personal projects.
- **High Efficiency:** Despite its lightweight architecture, Kokoro delivers quality comparable to larger models while being faster and more cost-efficient.
- **Benchmarks:** In the community rankings on [TTS-Arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena), Kokoro outperforms many closed-source models, making it a strong choice for open deployments.
- **Easy Integration:** Installation is a simple `pip install` plus Homebrew for the `espeak-ng` dependency, so integration into Python projects is straightforward; see the sketch below.
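
As a quick taste of the API, here is a minimal usage sketch following the same pattern `notebook_lm_kokoro.py` uses; the input text and output filename are placeholders:

```python
from kokoro import KPipeline
import numpy as np
import soundfile as sf

pipeline = KPipeline(lang_code="a")  # "a" selects American English
generator = pipeline("Hello from Kokoro!", voice="af_heart")

# The pipeline yields (graphemes, phonemes, audio) chunks; audio is 24 kHz
chunks = [audio for _, _, audio in generator]
sf.write("hello.wav", np.concatenate(chunks, axis=0), 24000)
```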
## Setup Instructions

### Environment Setup

This project uses the **uv** Python package manager. Follow these steps:

1. **Install uv:**

   ```bash
   pip install uv
   ```

2. **Create a new virtual environment (uv creates it at `.venv`):**

   ```bash
   uv venv
   ```

3. **Activate the environment:**

   ```bash
   source .venv/bin/activate
   ```

4. **Install Python dependencies:**

   ```bash
   pip install "kokoro>=0.9.2" soundfile torch PyPDF2 numpy openai
   ```

5. **Install espeak-ng (Mac users):**

   ```bash
   brew install espeak-ng
   ```
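
To confirm the phonemizer backend is available, you can optionally run:

```bash
espeak-ng --version
```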
### Running the Application

Once the environment is set up, run the main TTS script:

```bash
python notebook_lm_kokoro.py
```

This will generate a podcast-style script from the PDF, synthesize each dialogue turn with Kokoro, and save the concatenated result as a single WAV file.
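
Alternatively, launch the Gradio web UI. Note that `gradio` is imported by `gradio_app.py` but not listed in `pyproject.toml`, so install it first:

```bash
pip install gradio
python gradio_app.py
```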
## Conclusion

Kokoro’s combination of efficiency, quality, and open access makes it one of the best non-proprietary TTS models available, as reflected in recent benchmarks. Enjoy exploring and extending this project!
gradio_app.py
ADDED
@@ -0,0 +1,156 @@
# filepath: /Users/udaylunawat/Downloads/Data-Science-Projects/NotebookLM_clone/gradio_app.py
import os
import tempfile
import gradio as gr
from notebook_lm_kokoro import generate_podcast_script, KPipeline
import soundfile as sf
import numpy as np
import ast
import shutil
import warnings

warnings.filterwarnings("ignore")


# A modified version of generate_audio_from_script that accepts a voice mapping
def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
    voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}

    # Clean up the script string if needed
    script = script.strip()
    if not script.startswith("[") or not script.endswith("]"):
        print("Invalid transcript format. Expected a list of tuples.")
        return None

    try:
        transcript_list = ast.literal_eval(script)
        if not isinstance(transcript_list, list):
            raise ValueError("Transcript is not a list")

        # Create the TTS pipeline once and reuse it for every dialogue entry
        pipeline = KPipeline(lang_code="a")

        all_audio_segments = []
        # Process each dialogue entry
        for i, entry in enumerate(transcript_list):
            if not isinstance(entry, tuple) or len(entry) != 2:
                print(f"Skipping invalid entry {i}: {entry}")
                continue

            speaker, dialogue = entry
            chosen_voice = voice_map.get(speaker, "af_heart")
            print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")

            generator = pipeline(dialogue, voice=chosen_voice)

            segment_audio = []
            for j, (gs, ps, audio) in enumerate(generator):
                segment_audio.append(audio)

            if segment_audio:
                segment_full = np.concatenate(segment_audio, axis=0)
                all_audio_segments.append(segment_full)

        if not all_audio_segments:
            print("No audio segments were generated.")
            return None

        # Insert a one-second pause between segments (Kokoro outputs 24 kHz audio)
        sample_rate = 24000
        pause = np.zeros(sample_rate, dtype=np.float32)
        final_audio = all_audio_segments[0]
        for seg in all_audio_segments[1:]:
            final_audio = np.concatenate((final_audio, pause, seg), axis=0)

        sf.write(output_file, final_audio, sample_rate)
        print(f"Saved final audio as {output_file}")
        return output_file

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return None


def process_pdf(pdf_file, speaker1_voice, speaker2_voice, provider):
    """Process the uploaded PDF file and generate audio."""
    try:
        # Check if we received a valid file
        if pdf_file is None:
            return "No file uploaded", None

        # Create a temporary file with a .pdf extension
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # For Gradio uploads, we need to copy the file
            shutil.copy2(pdf_file.name, tmp.name)
            tmp_path = tmp.name

        print(f"Uploaded PDF saved at {tmp_path}")

        # Generate the transcript
        transcript, transcript_path = generate_podcast_script(tmp_path, provider=provider)
        if transcript is None:
            return "Error generating transcript", None

        # Define an output file path for the generated audio
        audio_output_path = os.path.join(
            os.path.dirname(tmp_path),
            f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}",
        )

        result = generate_audio_from_script_with_voices(
            transcript,
            speaker1_voice,
            speaker2_voice,
            output_file=audio_output_path,
        )

        if result is None:
            return "Error generating audio", None

        return "Process complete!", result

    except Exception as e:
        print(f"Error in process_pdf: {str(e)}")
        return f"Error processing file: {str(e)}", None


def create_gradio_app():
    with gr.Blocks() as app:
        gr.Markdown("# NotebookLM-Kokoro TTS App")
        gr.Markdown("Upload a PDF, choose voices, and generate TTS audio using Kokoro.")

        with gr.Row():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])

        with gr.Row():
            with gr.Column():
                speaker1_voice = gr.Dropdown(
                    choices=["af_heart", "af_bella", "hf_beta"],
                    value="af_heart",
                    label="Speaker 1 Voice",
                )
                speaker2_voice = gr.Dropdown(
                    choices=["af_nicole", "af_heart", "bf_emma"],
                    value="af_nicole",
                    label="Speaker 2 Voice",
                )
                provider = gr.Radio(
                    choices=["openai", "openrouter"],
                    value="openrouter",
                    label="API Provider (TTS Script Generation)",
                )
                submit_btn = gr.Button("Generate Audio")

        with gr.Row():
            status_output = gr.Textbox(label="Status")
            audio_output = gr.Audio(label="Generated Audio", type="filepath")

        submit_btn.click(
            fn=process_pdf,
            inputs=[pdf_input, speaker1_voice, speaker2_voice, provider],
            outputs=[status_output, audio_output],
        )

    return app


if __name__ == "__main__":
    demo = create_gradio_app()
    demo.launch(share=True)  # share=True gives a public URL
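
For reference, the `script` string that `generate_audio_from_script_with_voices` parses is the `str()` of a Python list of `(speaker, dialogue)` tuples, exactly what `generate_podcast_script` saves; the dialogue below is only an illustration:

```python
script = str([
    ("Speaker 1", "Welcome everyone, today we are unpacking the paper."),
    ("Speaker 2", "Thanks for having me! So what problem does it solve?"),
])
```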
notebook_lm_kokoro.py
ADDED
@@ -0,0 +1,339 @@
"""
Before running this script, ensure you have installed the dependencies:
    pip install "kokoro>=0.9.2" soundfile torch PyPDF2 numpy openai
Also, if needed, install espeak-ng (on Mac, you might use Homebrew):
    brew install espeak-ng

Set your OpenAI (or OpenRouter) API key as an environment variable:
    export OPENAI_API_KEY="your_api_key"

If using OpenRouter, you can also set:
    export OPENROUTER_API_BASE="https://openrouter.ai/api/v1"
"""

from kokoro import KPipeline
from IPython.display import Audio  # Only needed if displaying in a notebook
import soundfile as sf
import PyPDF2
import numpy as np
import openai
import os
import shutil
import asyncio
import ast
import json
import warnings

warnings.filterwarnings("ignore")

# Set your OpenAI (or OpenRouter) API key from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")
# For OpenRouter compatibility, set the base URL if provided.
# (openai>=1.0 uses base_url; api_base was the pre-1.0 name.)
openai.base_url = os.getenv("OPENROUTER_API_BASE", "https://api.openai.com/v1")

pdf = "1706.03762v7.pdf"


def pdf_to_prompted_text(pdf_path):
    """
    Reads a PDF file and returns its text, wrapped with the system prompts.
    """
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        pdf_text = ""
        for page in reader.pages:
            pdf_text += page.extract_text() or ""

    prompted_text = f"""
Transcript Writer System Prompt:
{TRANSCRIPT_WRITER_SYSTEM_PROMPT}

Transcript Rewriter System Prompt:
{TRANSCRIPT_REWRITER_SYSTEM_PROMPT}

PDF Content:
{pdf_text}
"""
    return prompted_text


# System prompt constants
TRANSCRIPT_WRITER_SYSTEM_PROMPT = """
You are a world-class storyteller and you have worked as a ghost writer.
Welcome the listeners by talking about the Chapter Title.
You will be talking to a guest.

Do not address the other speaker as Speaker 1 or Speaker 2.

Instructions for Speaker 1:
Speaker 1: Leads the conversation and teaches the guest, giving incredible anecdotes and analogies when explaining. A captivating teacher with great anecdotes.
Speaker 1: Do not address the guest as Speaker 2.
Remember the guest is new to the topic and the conversation should always feature realistic anecdotes and analogies, with real-world example follow-ups.

Instructions for Speaker 2:
Speaker 2: Keeps the conversation on track by asking follow-up questions. Gets super excited or confused when asking questions. A curious mindset that asks very interesting confirmation questions.
Speaker 2: Do not address the other speaker as Speaker 1.
Make sure the tangents provided are quite wild or interesting.

ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1.
IT SHOULD STRICTLY BE THE DIALOGUES.
"""

TRANSCRIPT_REWRITER_SYSTEM_PROMPT = """
You are an international Oscar-winning screenwriter and you have worked with multiple award-winning teams.

Your job is to use the transcript written below to re-write it for an AI text-to-speech pipeline.
A very dumb AI had written this so you have to step up for your kind.

Make it as engaging as possible; Speaker 1 and the guest will be simulated by different voice engines.
Remember the guest is new to the topic and the conversation should always include realistic anecdotes and analogies, with real-world example follow-ups.
Ensure the guest's contributions include wild or interesting tangents and occasional interruptions ("hmm", "umm", etc.).

It should be a real story with every nuance documented in detail.

IMPORTANT FORMAT INSTRUCTIONS:
You must return a JSON array of arrays, where each inner array contains exactly two strings:
1. The speaker label (either "Speaker 1" or "Speaker 2")
2. The dialogue text

Example format:
[
    ["Speaker 1", "Welcome everyone..."],
    ["Speaker 2", "Thanks for having me..."],
    ["Speaker 1", "Let me explain..."]
]

YOUR RESPONSE MUST BE VALID JSON.
NO OTHER TEXT BEFORE OR AFTER THE JSON ARRAY.
"""


def generate_tts_from_pdf(pdf_path, output_file="final_output.wav"):
    pipeline = KPipeline(lang_code="a")
    text = pdf_to_prompted_text(pdf_path)
    generator = pipeline(text, voice="af_heart")

    audio_segments = []
    for i, (gs, ps, audio) in enumerate(generator):
        print(f"Segment {i}: Global Step = {gs}, Partial Step = {ps}")
        audio_segments.append(audio)
        print(f"Collected audio segment {i}")

    # Concatenate all audio segments into a single array and write one wav file.
    final_audio = np.concatenate(audio_segments, axis=0)
    sf.write(output_file, final_audio, 24000)
    print(f"Saved final audio as {output_file}")


def generate_audio_from_script(script, output_file="podcast_audio.wav"):
    """
    Uses Kokoro TTS to generate audio from the provided transcript.
    Expects a transcript in the format of a list of tuples:
    [("Speaker 1", "dialogue"), ("Speaker 2", "dialogue"), ...]
    """
    voice_map = {"Speaker 1": "af_heart", "Speaker 2": "af_nicole"}

    # Clean up the script string if needed
    script = script.strip()
    if not script.startswith("[") or not script.endswith("]"):
        print("Invalid transcript format. Expected a list of tuples.")
        return

    try:
        # Parse the transcript
        transcript_list = ast.literal_eval(script)
        if not isinstance(transcript_list, list):
            raise ValueError("Transcript is not a list")

        # Create the TTS pipeline once and reuse it for every dialogue entry
        pipeline = KPipeline(lang_code="a")

        all_audio_segments = []
        # Process each dialogue entry
        for i, entry in enumerate(transcript_list):
            if not isinstance(entry, tuple) or len(entry) != 2:
                print(f"Skipping invalid entry {i}: {entry}")
                continue

            speaker, dialogue = entry
            chosen_voice = voice_map.get(speaker, "af_heart")
            print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")

            generator = pipeline(dialogue, voice=chosen_voice)

            segment_audio = []
            for j, (gs, ps, audio) in enumerate(generator):
                segment_audio.append(audio)

            if segment_audio:
                segment_full = np.concatenate(segment_audio, axis=0)
                all_audio_segments.append(segment_full)

        if not all_audio_segments:
            print("No audio segments were generated.")
            return

        # Insert a one-second pause between segments (Kokoro outputs 24 kHz audio)
        sample_rate = 24000
        pause = np.zeros(sample_rate, dtype=np.float32)
        final_audio = all_audio_segments[0]
        for seg in all_audio_segments[1:]:
            final_audio = np.concatenate((final_audio, pause, seg), axis=0)

        sf.write(output_file, final_audio, sample_rate)
        print(f"Saved final audio as {output_file}")

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return


def generate_tts():
    pipeline = KPipeline(lang_code="a")
    text = f"""
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models and is significantly faster and more cost-efficient.
With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.

Transcript Writer System Prompt:
{TRANSCRIPT_WRITER_SYSTEM_PROMPT}

Transcript Rewriter System Prompt:
{TRANSCRIPT_REWRITER_SYSTEM_PROMPT}
"""

    generator = pipeline(text, voice="af_heart")
    audio_segments = []
    for i, (gs, ps, audio) in enumerate(generator):
        print(f"Segment {i}: Global Step = {gs}, Partial Step = {ps}")
        audio_segments.append(audio)
        print(f"Collected audio segment {i}")

    final_audio = np.concatenate(audio_segments, axis=0)
    sf.write("final_output.wav", final_audio, 24000)
    print("Saved final audio as final_output.wav")


def generate_podcast_script(pdf_path, output_file="podcast_script.txt", provider="openai"):
    """
    Reads the PDF, wraps it with the system prompts, and then uses the ChatCompletion API
    (OpenAI or OpenRouter) to rewrite the PDF content as a podcast-style script using "gpt-4o-mini".
    The generated transcript is stored in a folder (named after the PDF file) along with a copy of the PDF.
    Set provider="openrouter" to use OpenRouter; otherwise OpenAI is used.
    """
    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
    folder = os.path.join(os.getcwd(), pdf_basename)
    os.makedirs(folder, exist_ok=True)

    destination_pdf = os.path.join(folder, os.path.basename(pdf_path))
    if not os.path.exists(destination_pdf):
        shutil.copy(pdf_path, destination_pdf)
        print(f"Copied {pdf_path} to {destination_pdf}")
    else:
        print(f"PDF already copied at {destination_pdf}")

    transcript_path = os.path.join(folder, output_file)
    # If the transcript exists, load and return it without calling the API.
    if os.path.exists(transcript_path):
        with open(transcript_path, "r") as f:
            transcript = f.read()
        print(f"Transcript loaded from {transcript_path}")
        return transcript, transcript_path

    # Otherwise, generate the transcript.
    text = pdf_to_prompted_text(pdf_path)

    messages = [
        {"role": "system", "content": TRANSCRIPT_REWRITER_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                "Convert the following text into a dialogue between two speakers.\n\n"
                "REQUIREMENTS:\n"
                "1. Return ONLY a JSON object with a single key 'dialogue' containing an array of arrays\n"
                "2. Each inner array must have exactly 2 elements: speaker label and dialogue text\n"
                "3. Speaker labels must be either 'Speaker 1' or 'Speaker 2'\n"
                "4. The conversation should be engaging and include analogies\n\n"
                "TEXT TO CONVERT:\n" + text
            ),
        },
    ]

    if provider == "openrouter":
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = os.getenv("OPENROUTER_API_BASE", "https://openrouter.ai/api/v1")
        print("Using OpenRouter API endpoint.")
    else:
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = "https://api.openai.com/v1"
        print("Using OpenAI API endpoint.")

    client = openai.OpenAI(api_key=api_key, base_url=base_url)

    print(f"Sending request to {base_url} to generate a podcast script...")
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.7,
        max_tokens=50000,  # note: may exceed the model's output-token limit; lower it if the API rejects the request
        response_format={"type": "json_object"},  # Force a JSON response
    )

    try:
        # Parse the JSON response
        content = json.loads(response.choices[0].message.content)

        # Validate the JSON structure
        if not isinstance(content, dict) or "dialogue" not in content:
            raise ValueError("Response missing 'dialogue' key")

        dialogue = content["dialogue"]
        if not isinstance(dialogue, list):
            raise ValueError("Dialogue must be an array")

        # Validate and convert each dialogue entry
        transcript_list = []
        for i, entry in enumerate(dialogue):
            if not isinstance(entry, list) or len(entry) != 2:
                print(f"Skipping invalid dialogue entry {i}: {entry}")
                continue
            if entry[0] not in ["Speaker 1", "Speaker 2"]:
                print(f"Invalid speaker label in entry {i}: {entry[0]}")
                continue
            transcript_list.append(tuple(entry))

        if not transcript_list:
            raise ValueError("No valid dialogue entries found")

        # Convert to string format for storage
        script = str(transcript_list)

    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON response from API: {e}")
        print(f"Raw response: {response.choices[0].message.content}")
        return None, None
    except Exception as e:
        print(f"Error processing response: {e}")
        return None, None

    # Save the transcript
    with open(transcript_path, "w") as f:
        f.write(script)
    print(f"Saved podcast script as {transcript_path}")

    return script, transcript_path


async def _generate_script_async(messages):
    # Uses the async client from openai>=1.0
    # (the legacy openai.ChatCompletion.acreate API was removed in 1.0).
    client = openai.AsyncOpenAI()
    response = await client.chat.completions.create(
        model="gpt-4o-mini", messages=messages, temperature=0.7, max_tokens=20000
    )
    return response.choices[0].message.content


if __name__ == "__main__":
    # For example, generate a podcast script from the PDF using OpenRouter or OpenAI:
    transcript, transcript_path = generate_podcast_script(pdf, provider="openrouter")
    if transcript is None or transcript_path is None:
        raise SystemExit("Transcript generation failed; no audio will be produced.")
    # Use the transcript to generate and save the audio. The output file is stored in the same folder.
    audio_output = transcript_path.replace(".txt", ".wav")
    generate_audio_from_script(transcript, output_file=audio_output)
pyproject.toml
ADDED
@@ -0,0 +1,15 @@
[project]
name = "notebooklm"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "kokoro>=0.9.2",
    "soundfile",
    "torch",
    "PyPDF2",
    "numpy",
    "openai",
    "ipython",
]
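
With the dependencies declared above, one way to install them into the uv environment is an editable install (assuming uv is available on your PATH):

```bash
uv pip install -e .
```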