Spaces:
Runtime error
Runtime error
File size: 6,797 Bytes
c6919c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import argparse
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
import os
import datetime
import soundfile as sf
import re
SUPPORTED_LANGS = [
("English", "en"),
("German", "de"),
("Spanish", "es"),
("French", "fr"),
("Hindi", "hi"),
("Italian", "it"),
("Japanese", "ja"),
("Korean", "ko"),
("Polish", "pl"),
("Portuguese", "pt"),
("Russian", "ru"),
("Turkish", "tr"),
("Chinese", "zh"),
]
ALLOWED_PROMPTS = set()
ALLOWED_PROMPTS = {"announcer"}
for _, lang in SUPPORTED_LANGS:
for n in range(10):
ALLOWED_PROMPTS.add(f"{lang}_speaker_{n}")
for n in range(10):
ALLOWED_PROMPTS.add(f"speaker_{n}")
def estimate_spoken_time(text, wpm=150, time_limit=14):
# Remove text within square brackets
text_without_brackets = re.sub(r'\[.*?\]', '', text)
words = text_without_brackets.split()
word_count = len(words)
time_in_seconds = (word_count / wpm) * 60
if time_in_seconds > time_limit:
return True, time_in_seconds
else:
return False, time_in_seconds
def save_audio_to_file(filename, audio_array, sample_rate=24000, format='WAV', subtype='PCM_16', output_dir=None):
# Create output directory if it doesn't exist
if output_dir:
os.makedirs(output_dir, exist_ok=True)
filepath = os.path.join(output_dir, filename)
else:
filepath = filename
i = 1
name, ext = os.path.splitext(filepath)
while os.path.exists(filepath):
filepath = f"{name}_{i}{ext}"
i += 1
sf.write(filepath, audio_array, sample_rate, format=format, subtype=subtype)
print(f"Saved audio to {filepath}")
def gen_and_save_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7, filename="", output_dir="bark_samples"):
def generate_unique_filename(base_filename):
name, ext = os.path.splitext(base_filename)
unique_filename = base_filename
counter = 1
while os.path.exists(unique_filename):
unique_filename = f"{name}_{counter}{ext}"
counter += 1
return unique_filename
longer_than_14_seconds, estimated_time = estimate_spoken_time(text_prompt)
print(f"Estimated time: {estimated_time:.2f} seconds.")
if longer_than_14_seconds:
print(f"Text Prompt could be too long, might want to try a shorter one if you get a bad result.")
print(f"Generating: {text_prompt}")
if args.history_prompt:
print(f"Using speaker: {history_prompt}")
else:
print(f"No speaker. Randomly generating a speaker.")
audio_array = generate_audio(text_prompt, history_prompt, text_temp=text_temp,
waveform_temp=waveform_temp)
if not filename:
date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H")
truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[:15].strip().replace(" ", "_")
filename = f"{truncated_text}-history_prompt-{history_prompt}-text_temp-{text_temp}-waveform_temp-{waveform_temp}-{date_str}.wav"
filename = generate_unique_filename(filename)
save_audio_to_file(filename, audio_array, SAMPLE_RATE, output_dir=output_dir)
def print_speakers_list():
print("Available history prompts:")
print("\nNon-specific speakers:")
print(" announcer")
print(" speaker_0 to speaker_9")
print("\nLanguage-specific speakers:")
for language, lang_code in SUPPORTED_LANGS:
speakers = ", ".join([f"{lang_code}_speaker_{n}" for n in range(10)])
print(f"\n {language}({lang_code}):\n{speakers}")
# If there's no text_prompt passed on the command line, process this list instead.
text_prompts = []
text_prompt = """
In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.
"""
text_prompts.append(text_prompt)
text_prompt = """
A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.
"""
text_prompts.append(text_prompt)
def main(args):
if args.list_speakers:
print_speakers_list()
else:
if args.text_prompt:
text_prompts_to_process = [args.text_prompt]
else:
print("No text prompt provided. Using default prompts defined in this file.")
text_prompts_to_process = text_prompts
if args.history_prompt:
history_prompt = args.history_prompt
else:
history_prompt = None
text_temp = args.text_temp if args.text_temp else 0.7
waveform_temp = args.waveform_temp if args.waveform_temp else 0.7
filename = args.filename if args.filename else ""
output_dir = args.output_dir if args.output_dir else "bark_samples"
print("Loading Bark models...")
if args.use_smaller_models:
print("Using smaller models.")
preload_models(use_smaller_models=True)
else:
preload_models()
print("Models loaded.")
for prompt in text_prompts_to_process:
gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""
Generate and save audio.
install this first: pip install soundfile
Example: python bark_speak.py --text_prompt "It is a mistake to think you can solve any major problems just with potatoes." --history_prompt en_speaker_3
""", formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.")
parser.add_argument("--history_prompt", help="Optional. Choose a speaker from the list of languages: " + ", ".join([lang[0] for lang in SUPPORTED_LANGS]) + ". Use --list_speakers to see all available options.")
parser.add_argument("--text_temp", type=float, help="Text temperature. Default is 0.7.")
parser.add_argument("--waveform_temp", type=float, help="Waveform temperature. Default is 0.7.")
parser.add_argument("--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.")
parser.add_argument("--output_dir", help="Output directory. Default is 'bark_samples'.")
parser.add_argument("--list_speakers", action="store_true", help="List all preset speaker options instead of generating audio.")
parser.add_argument("--use_smaller_models", action="store_true", help="Use for GPUS with less than 10GB of memory, or for more speed.")
args = parser.parse_args()
main(args)
|