Pijush2023 committed on
Commit
3bac656
·
verified ·
1 Parent(s): 22f3c9f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -50
app.py CHANGED
@@ -631,11 +631,55 @@ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
631
  from langchain.agents import Tool, initialize_agent
632
  from huggingface_hub import login
633
 
634
- def install_parler_tts():
635
- subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/huggingface/parler-tts.git"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
- # Call the function to install parler-tts
638
- install_parler_tts()
639
 
640
  # Check if the token is already set in the environment variables
641
  hf_token = os.getenv("HF_TOKEN")
@@ -889,7 +933,7 @@ def bot(history, choice, tts_model):
889
  if tts_model == "ElevenLabs":
890
  audio_future = executor.submit(generate_audio_elevenlabs, response)
891
  else:
892
- audio_future = executor.submit(generate_audio_parler_tts, response)
893
 
894
  for character in response:
895
  history[-1][1] += character
@@ -1109,50 +1153,6 @@ def generate_audio_elevenlabs(text):
1109
  logging.error(f"Error generating audio: {response.text}")
1110
  return None
1111
 
1112
- def generate_audio_parler_tts(text):
1113
- model_id = 'parler-tts/parler_tts_mini_v0.1'
1114
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
1115
- try:
1116
- model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
1117
- except Exception as e:
1118
- print(f"Error loading Parler TTS model: {e}")
1119
- return None
1120
-
1121
- tokenizer = AutoTokenizer.from_pretrained(model_id)
1122
-
1123
- description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
1124
-
1125
- try:
1126
- input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
1127
- prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
1128
- except Exception as e:
1129
- print(f"Error tokenizing input: {e}")
1130
- return None
1131
-
1132
- max_input_length = model.config.n_positions - input_ids.shape[1]
1133
- segments = [prompt_input_ids[0][i:i+max_input_length] for i in range(0, prompt_input_ids.shape[1], max_input_length)]
1134
-
1135
- audio_segments = []
1136
- for segment in segments:
1137
- segment = segment.unsqueeze(0)
1138
- try:
1139
- generation = model.generate(input_ids=input_ids, prompt_input_ids=segment)
1140
- except Exception as e:
1141
- print(f"Error generating audio segment: {e}")
1142
- return None
1143
-
1144
- audio_arr = generation.cpu().numpy().squeeze()
1145
- audio_segments.append(audio_arr)
1146
-
1147
- full_audio = np.concatenate(audio_segments)
1148
-
1149
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
1150
- sf.write(f.name, full_audio, model.config.sampling_rate)
1151
- temp_audio_path = f.name
1152
-
1153
- logging.debug(f"Audio saved to {temp_audio_path}")
1154
- return temp_audio_path
1155
-
1156
  # Stable Diffusion setup
1157
  pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
1158
  pipe = pipe.to("cuda")
@@ -1185,7 +1185,7 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
1185
 
1186
  chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
1187
  choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
1188
- tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "Parler TTS"], value="Parler TTS")
1189
 
1190
  gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
1191
  chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
@@ -1226,3 +1226,4 @@ demo.launch(share=True)
1226
 
1227
 
1228
 
 
 
631
  from langchain.agents import Tool, initialize_agent
632
  from huggingface_hub import login
633
 
634
+ from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
635
+ from string import punctuation
636
+
637
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
638
+
639
+ repo_id = "parler-tts/parler-tts-mini-expresso"
640
+
641
+ model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
642
+ tokenizer = AutoTokenizer.from_pretrained(repo_id)
643
+ feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
644
+
645
+ SAMPLE_RATE = feature_extractor.sampling_rate
646
+ SEED = 42
647
+
648
+ number_normalizer = EnglishNumberNormalizer()
649
+
650
def preprocess(text):
    """Prepare raw text for use as a TTS prompt.

    The text is passed through the module-level ``number_normalizer``,
    stripped, given a trailing "." when it does not already end in a
    punctuation character, and has upper-case abbreviations split into
    space-separated letters (e.g. "USA" / "U.S.A" -> "U S A").

    Args:
        text: Raw text to be spoken.

    Returns:
        The normalized text, or "" when the input is empty / whitespace-only
        (previously this case raised ``IndexError`` on ``text[-1]``).
    """
    # Guard before touching text[-1]: an empty response must not crash TTS.
    if not text or not text.strip():
        return ""

    text = number_normalizer(text).strip()
    if not text:
        return ""

    if text[-1] not in punctuation:
        text = f"{text}."

    # An upper-case letter followed by one or more upper-case letters or dots,
    # delimited by word boundaries.
    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(match):
        # Drop the dots and put a space between the remaining letters.
        return " ".join(match.group().replace(".", ""))

    # Substitute each match in place. Sequential str.replace() calls would
    # corrupt overlapping abbreviations ("US" rewriting part of "USA") and
    # would also touch occurrences that are not whole-word matches.
    return re.sub(abbreviations_pattern, separate_abb, text)
667
+
668
def generate_audio(text, description="Thomas speaks with emphasis and excitement at a moderate pace with high quality."):
    """Synthesize ``text`` to a temporary WAV file with the Parler-TTS model.

    Uses the module-level ``tokenizer``, ``model``, ``device``, ``SEED`` and
    ``SAMPLE_RATE``; ``text`` is run through ``preprocess`` before tokenizing.

    Args:
        text: The text to speak.
        description: Prompt describing the voice/style, passed to the model
            as ``input_ids``.

    Returns:
        Path of the generated ``.wav`` file (created with ``delete=False``,
        so the caller owns its cleanup), or ``None`` when ``text`` is empty
        or synthesis fails -- the same failure convention as
        ``generate_audio_elevenlabs``.
    """
    # Nothing to speak: skip the model call entirely.
    if not text or not text.strip():
        logging.warning("generate_audio called with empty text; skipping synthesis")
        return None

    try:
        inputs = tokenizer(description, return_tensors="pt").to(device)
        prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)

        # Seed is reset before every generation call (presumably so the same
        # text yields the same audio -- confirm against Parler-TTS docs).
        set_seed(SEED)
        generation = model.generate(input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids)
        audio_arr = generation.cpu().numpy().squeeze()

        # Reserve a path, then write after the handle is closed: reopening a
        # still-open NamedTemporaryFile by name fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            temp_audio_path = f.name
        sf.write(temp_audio_path, audio_arr, SAMPLE_RATE)
    except Exception:
        # This runs in a worker thread; log with traceback and report failure
        # via None instead of letting the exception surface from the future.
        logging.exception("Error generating audio with Parler-TTS")
        return None

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
682
 
 
 
683
 
684
  # Check if the token is already set in the environment variables
685
  hf_token = os.getenv("HF_TOKEN")
 
933
  if tts_model == "ElevenLabs":
934
  audio_future = executor.submit(generate_audio_elevenlabs, response)
935
  else:
936
+ audio_future = executor.submit(generate_audio, response) # Updated function call
937
 
938
  for character in response:
939
  history[-1][1] += character
 
1153
  logging.error(f"Error generating audio: {response.text}")
1154
  return None
1155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1156
  # Stable Diffusion setup
1157
  pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
1158
  pipe = pipe.to("cuda")
 
1185
 
1186
  chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
1187
  choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
1188
+ tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "New TTS Model"], value="New TTS Model")
1189
 
1190
  gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
1191
  chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
 
1226
 
1227
 
1228
 
1229
+