Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -631,11 +631,55 @@ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
|
|
631 |
from langchain.agents import Tool, initialize_agent
|
632 |
from huggingface_hub import login
|
633 |
|
634 |
-
|
635 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
636 |
|
637 |
-
# Call the function to install parler-tts
|
638 |
-
install_parler_tts()
|
639 |
|
640 |
# Check if the token is already set in the environment variables
|
641 |
hf_token = os.getenv("HF_TOKEN")
|
@@ -889,7 +933,7 @@ def bot(history, choice, tts_model):
|
|
889 |
if tts_model == "ElevenLabs":
|
890 |
audio_future = executor.submit(generate_audio_elevenlabs, response)
|
891 |
else:
|
892 |
-
audio_future = executor.submit(
|
893 |
|
894 |
for character in response:
|
895 |
history[-1][1] += character
|
@@ -1109,50 +1153,6 @@ def generate_audio_elevenlabs(text):
|
|
1109 |
logging.error(f"Error generating audio: {response.text}")
|
1110 |
return None
|
1111 |
|
1112 |
-
def generate_audio_parler_tts(text):
    """Synthesize *text* with Parler-TTS and return the path of a WAV file.

    The prompt tokens are cut into segments of at most
    ``model.config.n_positions - len(description tokens)`` tokens, one
    ``model.generate`` call is made per segment, and the resulting audio
    arrays are concatenated before being written to disk.

    Args:
        text: The text to speak.

    Returns:
        Path of a temporary ``.wav`` file (created with ``delete=False``, so
        the caller owns its cleanup), or ``None`` when the model cannot be
        loaded, tokenization fails, a segment fails to generate, or there is
        nothing to generate.
    """
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # NOTE: the model and tokenizer are loaded again on every call.
    try:
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    except Exception as e:
        # Use logging (not print) so failures land in the same place as the
        # other audio errors reported by this file.
        logging.error(f"Error loading Parler TTS model: {e}")
        return None

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    try:
        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
        prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
    except Exception as e:
        logging.error(f"Error tokenizing input: {e}")
        return None

    # Token budget left for the prompt once the description is accounted for.
    max_input_length = model.config.n_positions - input_ids.shape[1]
    if max_input_length <= 0:
        # A zero step would make range() raise and a negative one would yield
        # no segments at all; report it like every other failure instead.
        logging.error("Error generating audio: description leaves no room for the prompt")
        return None

    prompt_tokens = prompt_input_ids[0]
    segments = [
        prompt_tokens[start:start + max_input_length]
        for start in range(0, prompt_input_ids.shape[1], max_input_length)
    ]

    audio_segments = []
    for segment in segments:
        # Restore the batch dimension dropped by indexing row 0 above.
        segment = segment.unsqueeze(0)
        try:
            generation = model.generate(input_ids=input_ids, prompt_input_ids=segment)
        except Exception as e:
            logging.error(f"Error generating audio segment: {e}")
            return None

        audio_segments.append(generation.cpu().numpy().squeeze())

    if not audio_segments:
        # np.concatenate raises on an empty list (prompt produced no tokens).
        logging.error("Error generating audio: prompt produced no tokens")
        return None

    full_audio = np.concatenate(audio_segments)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        sf.write(f.name, full_audio, model.config.sampling_rate)
        temp_audio_path = f.name

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
|
1155 |
-
|
1156 |
# Stable Diffusion setup
|
1157 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
1158 |
pipe = pipe.to("cuda")
|
@@ -1185,7 +1185,7 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
|
|
1185 |
|
1186 |
chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
|
1187 |
choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
|
1188 |
-
tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "
|
1189 |
|
1190 |
gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
|
1191 |
chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
|
@@ -1226,3 +1226,4 @@ demo.launch(share=True)
|
|
1226 |
|
1227 |
|
1228 |
|
|
|
|
631 |
from langchain.agents import Tool, initialize_agent
|
632 |
from huggingface_hub import login
|
633 |
|
634 |
+
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
|
635 |
+
from string import punctuation
|
636 |
+
|
637 |
+
# Parler-TTS setup: everything below runs once, at import time.

# Use the first CUDA device when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Single repository that supplies the model, tokenizer and feature extractor.
repo_id = "parler-tts/parler-tts-mini-expresso"

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sample rate taken from the feature extractor; used when writing WAV output.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed passed to set_seed() before each generation.
SEED = 42

# Applied to the text by preprocess() before tokenization.
number_normalizer = EnglishNumberNormalizer()
|
649 |
+
|
650 |
+
def preprocess(text):
    """Normalize *text* before it is handed to the TTS tokenizer.

    Steps, in order:
      1. Pass the text through the module-level ``number_normalizer`` and
         strip surrounding whitespace.
      2. Append a full stop when the text does not already end with a
         punctuation character.
      3. Spell out upper-case abbreviations letter by letter, dropping any
         dots inside them ("NASA" -> "N A S A", "U.S.A" -> "U S A").

    Args:
        text: Raw text to be spoken.

    Returns:
        The normalized string. Empty or whitespace-only input yields ``""``
        instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    if not text:
        # Nothing to punctuate or expand; indexing text[-1] would fail here.
        return text
    if text[-1] not in punctuation:
        text = f"{text}."

    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(match):
        # "U.S.A" -> "USA" -> "U S A"
        return " ".join(match.group(0).replace(".", ""))

    # re.sub rewrites each match where it was found, so an abbreviation that
    # is a substring of a longer word or abbreviation ("US" inside "USA") is
    # no longer rewritten by accident as a blanket str.replace would do.
    return re.sub(abbreviations_pattern, separate_abb, text)
|
667 |
+
|
668 |
+
def generate_audio(text, description="Thomas speaks with emphasis and excitement at a moderate pace with high quality."):
    """Generate speech for *text* and return the path of the WAV file.

    Args:
        text: Text to speak; it is run through ``preprocess`` first.
        description: Voice description tokenized and passed to the model as
            ``input_ids``.

    Returns:
        Path of a temporary ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed when it is closed.
    """
    description_ids = tokenizer(description, return_tensors="pt").to(device).input_ids
    prompt_ids = tokenizer(preprocess(text), return_tensors="pt").to(device).input_ids

    # Re-seed before every generation with the module-level SEED.
    set_seed(SEED)
    waveform = model.generate(input_ids=description_ids, prompt_input_ids=prompt_ids)
    waveform = waveform.cpu().numpy().squeeze()

    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        temp_audio_path = wav_file.name
        sf.write(temp_audio_path, waveform, SAMPLE_RATE)

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
|
682 |
|
|
|
|
|
683 |
|
684 |
# Check if the token is already set in the environment variables
|
685 |
hf_token = os.getenv("HF_TOKEN")
|
|
|
933 |
if tts_model == "ElevenLabs":
|
934 |
audio_future = executor.submit(generate_audio_elevenlabs, response)
|
935 |
else:
|
936 |
+
audio_future = executor.submit(generate_audio, response) # Updated function call
|
937 |
|
938 |
for character in response:
|
939 |
history[-1][1] += character
|
|
|
1153 |
logging.error(f"Error generating audio: {response.text}")
|
1154 |
return None
|
1155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1156 |
# Stable Diffusion setup
|
1157 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
1158 |
pipe = pipe.to("cuda")
|
|
|
1185 |
|
1186 |
chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
|
1187 |
choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
|
1188 |
+
tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "New TTS Model"], value="New TTS Model")
|
1189 |
|
1190 |
gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
|
1191 |
chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
|
|
|
1226 |
|
1227 |
|
1228 |
|
1229 |
+
|