import streamlit as st
import os
import json
import shutil
import re
import requests
import pyttsx3
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
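# The .env file is expected to define the ElevenLabs key, e.g.:
#   ELEVENLABS_API_KEY=<your ElevenLabs API key>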
# Streamlit configuration
st.set_page_config(page_title="Podcast Generator", layout="wide")
st.title("🎙️ Podcast Generator")
# System prompt for conversation generation
system_prompt = """you are an experienced podcast host...
- based on text like an article you can create an engaging conversation between two people.
- make the conversation engaging with a lot of emotion.
- in the response, identify speakers as Sascha and Marina.
- Sascha is the writer, and Marina is the one asking questions.
- The podcast is called The Machine Learning Engineer.
- Short sentences that can be easily used with speech synthesis.
- Use natural conversation fillers like "äh" to make it sound real.
"""
# Load Hugging Face's distilgpt2 model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Pyttsx3 setup
engine = pyttsx3.init()
engine.setProperty("rate", 150) # Adjust speech rate as needed
engine.setProperty("voice", "english") # Set to English voice
# Retrieve ElevenLabs API key from environment
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
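# The trailing path segment of the URL below ("ERL3svWBAQ18ByCZTr4k") is the ElevenLabs voice ID used for Sascha.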
elevenlabs_url = "https://api.elevenlabs.io/v1/text-to-speech/ERL3svWBAQ18ByCZTr4k"
elevenlabs_headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": elevenlabs_api_key
}
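# Optional sanity check: warn in the UI if the key is missing rather than
# letting the ElevenLabs request fail later with an authentication error.
if not elevenlabs_api_key:
    st.warning("ELEVENLABS_API_KEY is not set; Sascha's lines cannot be synthesized.")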
# ElevenLabs TTS function for Sascha
def synthesize_speech_elevenlabs(text, speaker, index):
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2_5",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    response = requests.post(elevenlabs_url, json=data, headers=elevenlabs_headers, stream=True)
    response.raise_for_status()  # Fail loudly if the API rejects the request
    filename = f"audio-files/{index}_{speaker}.mp3"
    with open(filename, "wb") as out:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)
# Pyttsx3 TTS function for Marina
def synthesize_speech_pyttsx3(text, speaker, index):
    # pyttsx3 writes the engine's native output (typically WAV/AIFF), not MP3,
    # so save with a .wav extension; merge_audios picks up .wav files as well.
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()
# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    if speaker == "Sascha":
        synthesize_speech_elevenlabs(text, speaker, index)
    else:
        synthesize_speech_pyttsx3(text, speaker, index)
# Function to sort filenames naturally
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]
# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")
# Function to generate the conversation using distilgpt2
def generate_conversation(article):
    prompt = system_prompt + "\n\nArticle:\n" + article + "\n\nSascha: "
    # distilgpt2 has a 1024-token context window, so truncate the prompt and
    # bound the generation length instead of requesting 8192 tokens.
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
    output = model.generate(input_ids, max_new_tokens=512, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens so the prompt and article are not echoed back
    conversation_text = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    # Split into lines and alternate speakers, starting with Sascha
    lines = conversation_text.splitlines()
    conversation = []
    speaker = "Sascha"
    for line in lines:
        if line.strip():
            conversation.append({"speaker": speaker, "text": line.strip()})
            speaker = "Marina" if speaker == "Sascha" else "Sascha"
    return conversation
# Function to generate the podcast audio from conversation data
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)
    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)
    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file
# Streamlit inputs and outputs
# Use a placeholder (not a default value) so the empty-input check below actually triggers
article = st.text_area("Article Content", placeholder="Paste the article text here", height=300)
if st.button("Generate Podcast"):
    if not article:
        st.error("Please enter article content to generate a podcast.")
    else:
        with st.spinner("Generating conversation..."):
            conversation = generate_conversation(article)
        st.success("Conversation generated successfully!")
        st.json(conversation)
        # Generate audio files
        with st.spinner("Synthesizing audio..."):
            podcast_file = generate_audio(conversation)
        st.success("Audio synthesis complete!")
        st.audio(podcast_file, format="audio/mp3")
        with open(podcast_file, "rb") as file:
            st.download_button("Download Podcast", data=file, file_name="podcast.mp3", mime="audio/mp3")