import streamlit as st
import os
import json
import shutil
import re
import requests
import pyttsx3
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
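# The .env file is expected to define the ElevenLabs key, e.g.:
#   ELEVENLABS_API_KEY=<your ElevenLabs API key>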
# Streamlit configuration
st.set_page_config(page_title="Podcast Generator", layout="wide")
st.title("🎙️ Podcast Generator")
# System prompt for conversation generation
system_prompt = """you are an experienced podcast host...
- based on text like an article you can create an engaging conversation between two people.
- make the conversation engaging with a lot of emotion.
- in the response, identify speakers as Sascha and Marina.
- Sascha is the writer, and Marina is the one asking questions.
- The podcast is called The Machine Learning Engineer.
- Short sentences that can be easily used with speech synthesis.
- Use natural conversation fillers like "äh" to make it sound real.
"""
# Load Hugging Face's distilgpt2 model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Pyttsx3 setup
engine = pyttsx3.init()
engine.setProperty("rate", 150) # Adjust speech rate as needed
engine.setProperty("voice", "english") # Set to English voice
# Retrieve ElevenLabs API key from environment
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
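# The trailing path segment of the URL below ("ERL3svWBAQ18ByCZTr4k") is the ElevenLabs voice ID used for Sascha.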
elevenlabs_url = "https://api.elevenlabs.io/v1/text-to-speech/ERL3svWBAQ18ByCZTr4k"
elevenlabs_headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": elevenlabs_api_key
}
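# Optional sanity check: warn in the UI if the key is missing rather than
# letting the ElevenLabs request fail later with an authentication error.
if not elevenlabs_api_key:
    st.warning("ELEVENLABS_API_KEY is not set; Sascha's lines cannot be synthesized.")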
# ElevenLabs TTS function for Sascha
def synthesize_speech_elevenlabs(text, speaker, index):
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2_5",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    response = requests.post(elevenlabs_url, json=data, headers=elevenlabs_headers, stream=True)
    response.raise_for_status()  # Fail loudly if the API rejects the request
    filename = f"audio-files/{index}_{speaker}.mp3"
    with open(filename, "wb") as out:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)
# Pyttsx3 TTS function for Marina
def synthesize_speech_pyttsx3(text, speaker, index):
    # pyttsx3 writes the engine's native output (typically WAV/AIFF), not MP3,
    # so save with a .wav extension; merge_audios picks up .wav files as well.
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()
# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    if speaker == "Sascha":
        synthesize_speech_elevenlabs(text, speaker, index)
    else:
        synthesize_speech_pyttsx3(text, speaker, index)
# Function to sort filenames naturally
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]
# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")
# Function to generate the conversation using distilgpt2
def generate_conversation(article):
    prompt = system_prompt + "\n\nArticle:\n" + article + "\n\nSascha: "
    # distilgpt2 has a 1024-token context window, so truncate the prompt and
    # bound the generation length instead of requesting 8192 tokens.
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
    output = model.generate(input_ids, max_new_tokens=512, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens so the prompt and article are not echoed back
    conversation_text = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    # Split into lines and alternate speakers, starting with Sascha
    lines = conversation_text.splitlines()
    conversation = []
    speaker = "Sascha"
    for line in lines:
        if line.strip():
            conversation.append({"speaker": speaker, "text": line.strip()})
            speaker = "Marina" if speaker == "Sascha" else "Sascha"
    return conversation
# Function to generate the podcast audio from conversation data
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)
    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)
    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file
# Streamlit inputs and outputs
# Use a placeholder (not a default value) so the empty-input check below actually triggers
article = st.text_area("Article Content", placeholder="Paste the article text here", height=300)
if st.button("Generate Podcast"):
    if not article:
        st.error("Please enter article content to generate a podcast.")
    else:
        with st.spinner("Generating conversation..."):
            conversation = generate_conversation(article)
        st.success("Conversation generated successfully!")
        st.json(conversation)
        # Generate audio files
        with st.spinner("Synthesizing audio..."):
            podcast_file = generate_audio(conversation)
        st.success("Audio synthesis complete!")
        st.audio(podcast_file, format="audio/mp3")
        with open(podcast_file, "rb") as file:
            st.download_button("Download Podcast", data=file, file_name="podcast.mp3", mime="audio/mp3")