Spaces:
Sleeping
Sleeping
File size: 3,841 Bytes
35cda3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import os
import re
import json
import shutil
import pyttsx3
from pydub import AudioSegment
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from dotenv import load_dotenv
# Load environment variables from .env file (API keys etc., if any are used downstream)
load_dotenv()
# Initialize GPT-2 model and tokenizer.
# distilgpt2 is a small distilled GPT-2 — fast enough to run on CPU.
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# System prompt prepended to the article when generating the conversation.
# NOTE(review): distilgpt2 is not instruction-tuned, so it treats this as
# plain context rather than obeying it — output quality will vary.
system_prompt = """Generate a conversation between Sascha and Marina based on the article content provided.
Sascha is the article writer, and Marina is the interviewer. Make it engaging and emotional, with natural pauses (like "uh")
to make it sound conversational. This is for a podcast called "The Machine Learning Engineer"."""
# TTS backend per speaker. Both currently map to pyttsx3 (offline TTS);
# the map exists so other backends could be plugged in per speaker later.
speaker_voice_map = {
"Sascha": "pyttsx3", # Sascha will use pyttsx3 for offline TTS
"Marina": "pyttsx3" # Marina uses pyttsx3 for offline TTS
}
# Initialize the shared pyttsx3 engine used by synthesize_speech_pyttsx3.
engine = pyttsx3.init()
engine.setProperty('rate', 150) # Speed of speech (words per minute)
engine.setProperty('volume', 0.9) # Volume (0.0 to 1.0)
# Pyttsx3 TTS function for offline TTS
def synthesize_speech_pyttsx3(text, speaker, index):
    """Render one utterance to an audio file using the offline pyttsx3 engine.

    Args:
        text: The utterance to synthesize.
        speaker: Speaker name; embedded in the output filename.
        index: Position of the utterance in the conversation — keeps the
            files in playback order for the later merge step.
    """
    # NOTE(review): pyttsx3 writes platform-native audio data regardless of
    # the extension; the .mp3 suffix here is cosmetic. pydub probes the real
    # format when merging — confirm on the target platform.
    filename = f"audio-files/{index}_{speaker}.mp3"
    engine.save_to_file(text, filename)
    engine.runAndWait()
    # Bug fix: the message previously printed a hard-coded "(unknown)"
    # placeholder instead of interpolating the actual output path.
    print(f'Audio content written to file "{filename}"')
# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    """Dispatch a single conversation turn to the TTS backend.

    All speakers currently use the offline pyttsx3 backend, so this is a
    plain forwarding call kept as a seam for adding other backends.
    """
    return synthesize_speech_pyttsx3(text, speaker, index)
# Function to sort filenames naturally
def natural_sort_key(filename):
return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]
# Function to merge audio files
def merge_audios(audio_folder, output_file):
    """Concatenate every .mp3/.wav clip in *audio_folder* into one MP3.

    Clips are ordered naturally (numeric prefixes sort numerically) so the
    conversation plays back in the order it was generated.
    """
    ordered = sorted(
        (name for name in os.listdir(audio_folder)
         if name.endswith((".mp3", ".wav"))),
        key=natural_sort_key,
    )
    combined = AudioSegment.empty()
    for name in ordered:
        audio_path = os.path.join(audio_folder, name)
        print(f"Processing: {audio_path}")
        combined += AudioSegment.from_file(audio_path)
    combined.export(output_file, format="mp3")
    print(f"Merged audio saved as {output_file}")
# Function to generate conversation using distilgpt2
def generate_conversation(article):
    """Generate a Sascha/Marina dialogue about *article* using distilgpt2.

    Args:
        article: Raw article text to base the conversation on.

    Returns:
        A list of ``{"speaker": ..., "text": ...}`` dicts parsed from the
        generated text (also pretty-printed to stdout).
    """
    input_text = f"{system_prompt}\n\n{article}\n\nSascha: "
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    # NOTE(review): temperature has no effect without do_sample=True, so
    # decoding is effectively greedy here. Left as-is to preserve output.
    outputs = model.generate(inputs, max_length=1024, num_return_sequences=1, temperature=1.0)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Split on speaker tags; element 0 is any preamble before the first
    # tag, so drop it. The remainder alternates tag, utterance, tag, ...
    lines = re.split(r'(Sascha:|Marina:)', generated_text)[1:]
    # Bug fix: stop at len(lines) - 1 when pairing tag with utterance.
    # The original indexed lines[i + 1] and raised IndexError whenever the
    # generated text ended on a speaker tag with no trailing utterance.
    conversation = [
        {"speaker": lines[i].strip(), "text": lines[i + 1].strip()}
        for i in range(0, len(lines) - 1, 2)
    ]
    formatted_json = json.dumps(conversation, indent=4)
    print(formatted_json)
    return conversation
# Function to generate the podcast audio
def generate_audio(conversation):
    """Synthesize every turn of *conversation* and merge into podcast.mp3.

    Args:
        conversation: List of ``{"speaker": ..., "text": ...}`` dicts as
            produced by generate_conversation.

    Returns:
        The path of the merged output file ("podcast.mp3").
    """
    # Start from a clean directory so stale clips from a previous run
    # never leak into the merged output.
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)
    for index, turn in enumerate(conversation):
        synthesize_speech(turn['text'], turn['speaker'], index)
    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file
# Script entry point.
# Bug fix: previously this ran unconditionally at module level, so merely
# importing the file kicked off full podcast synthesis. Guard with the
# standard __main__ check so the functions above are importable.
if __name__ == "__main__":
    # Read the article; encoding is pinned to UTF-8 so behavior does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open('function-calling.txt', 'r', encoding='utf-8') as file:
        article = file.read()
    # Generate conversation and audio
    conversation = generate_conversation(article)
    generate_audio(conversation)
|