Pijush2023 committed
Commit ef3c87d · verified · 1 Parent(s): cad8a30

Update app.py

Files changed (1):
  1. app.py +92 -15

app.py CHANGED
@@ -755,18 +755,87 @@ repo_id = "parler-tts/parler-tts-mini-v1"
 
 
 
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunk_size_in_s = 0.5
+
+#     # Initialize the tokenizer and model
+#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+#     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+#     def generate(text, description, play_steps_in_s=0.5):
+#         play_steps = int(frame_rate * play_steps_in_s)
+#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+#         generation_kwargs = dict(
+#             input_ids=inputs.input_ids,
+#             prompt_input_ids=prompt.input_ids,
+#             attention_mask=inputs.attention_mask,
+#             prompt_attention_mask=prompt.attention_mask,
+#             streamer=streamer,
+#             do_sample=True,
+#             temperature=1.0,
+#             min_new_tokens=10,
+#         )
+
+#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         for new_audio in streamer:
+#             if new_audio.shape[0] == 0:
+#                 break
+#             # Save or process each audio chunk as it is generated
+#             yield sampling_rate, new_audio
+
+#     audio_segments = []
+#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+#         audio_segments.append(audio_chunk)
+#         # Here, you can save the chunk to a file or send it to a frontend
+#         # For example, you could write the chunk to a file immediately:
+#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+#         logging.debug(f"Saved chunk to {temp_audio_path}")
+#         # You could also send the chunk to a web client if this was a web application
+
+#     # Combine all the audio chunks into one audio file
+#     combined_audio = np.concatenate(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
+#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+
+#     logging.debug(f"Combined audio saved to {combined_audio_path}")
+#     return combined_audio_path
+
+import concurrent.futures
+import tempfile
+import os
+import numpy as np
+from threading import Thread
+from transformers import AutoTokenizer
+from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+from scipy.io.wavfile import write as write_wav
+import logging
+
 def generate_audio_parler_tts(text):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s = 0.5
+
+    chunk_size_in_s = 0.3  # Smaller chunk size for lower latency
 
     # Initialize the tokenizer and model
     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+
     sampling_rate = parler_model.audio_encoder.config.sampling_rate
     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+    play_steps = int(frame_rate * chunk_size_in_s)
 
-    def generate(text, description, play_steps_in_s=0.5):
-        play_steps = int(frame_rate * play_steps_in_s)
+    def generate_chunks(text, description):
         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
 
         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
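The hunk above hoists play_steps = int(frame_rate * chunk_size_in_s) out of the inner generator and shrinks the chunk from 0.5 s to 0.3 s, so the streamer hands back audio sooner. The re-imports at new lines 814-822 are harmless (Python caches modules after the first import) but would conventionally live at the top of app.py. As a rough sanity check of the chunk arithmetic, with illustrative codec values (at runtime both rates are read from parler_model.audio_encoder.config, so treat these constants as assumptions):

    # Illustrative values only; the real ones come from the model config.
    frame_rate = 86           # assumed codec frames per second
    sampling_rate = 44_100    # assumed audio samples per second

    chunk_size_in_s = 0.3
    play_steps = int(frame_rate * chunk_size_in_s)         # 25 decoder steps per chunk
    approx_samples = int(sampling_rate * chunk_size_in_s)  # ~13,230 samples, i.e. ~0.3 s of audio
    print(play_steps, approx_samples)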
@@ -779,7 +848,7 @@ def generate_audio_parler_tts(text):
             prompt_attention_mask=prompt.attention_mask,
             streamer=streamer,
             do_sample=True,
-            temperature=1.0,
+            temperature=0.7,  # Lower temperature for faster generation
             min_new_tokens=10,
         )
 
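One caveat on this hunk: temperature controls sampling randomness, not decoding speed, so lowering it to 0.7 mainly trades prosodic variety for more consistent output rather than cutting latency. If latency is the goal, time-to-first-chunk is the more direct metric; a minimal probe, assuming the inner generate_chunks generator were lifted to module scope so it could be called directly:

    import time

    def time_to_first_chunk(text, description):
        # Seconds until the streamer yields its first audio chunk, or None.
        start = time.perf_counter()
        for _sampling_rate, _chunk in generate_chunks(text, description):
            return time.perf_counter() - start
        return None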
@@ -789,18 +858,27 @@ def generate_audio_parler_tts(text):
         for new_audio in streamer:
             if new_audio.shape[0] == 0:
                 break
-            # Save or process each audio chunk as it is generated
             yield sampling_rate, new_audio
 
-    audio_segments = []
-    for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-        audio_segments.append(audio_chunk)
-        # Here, you can save the chunk to a file or send it to a frontend
-        # For example, you could write the chunk to a file immediately:
-        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-        logging.debug(f"Saved chunk to {temp_audio_path}")
-        # You could also send the chunk to a web client if this was a web application
+    def process_audio_chunks(chunks):
+        audio_segments = []
+        for sampling_rate, audio_chunk in chunks:
+            audio_segments.append(audio_chunk)
+            temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+            write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+            logging.debug(f"Saved chunk to {temp_audio_path}")
+            # Optionally, send this chunk to the client in real-time
+        return audio_segments
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Start processing audio chunks in a separate thread
+        future_chunks = executor.submit(process_audio_chunks, generate_chunks(text, description))
+
+        # Continue with other tasks in parallel
+        # (e.g., you can update the chatbot interface, handle other requests, etc.)
+
+        # Wait for audio processing to complete and get the result
+        audio_segments = future_chunks.result()
 
     # Combine all the audio chunks into one audio file
     combined_audio = np.concatenate(audio_segments)
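Note that the executor only buys concurrency if work actually happens between submit() and result(); as committed, result() is called right away, so chunk collection runs effectively serially (the model itself still generates on its own Thread inside generate_chunks). A self-contained sketch of the intended producer/consumer overlap, with a stand-in generator in place of the real streamer:

    import concurrent.futures
    import time

    def fake_chunks():
        # Stand-in for generate_chunks: three chunks, 0.1 s apart.
        for i in range(3):
            time.sleep(0.1)
            yield 44_100, f"chunk-{i}"

    def collect(chunks):
        return [audio for _rate, audio in chunks]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(collect, fake_chunks())
        time.sleep(0.15)            # work done here overlaps with chunk production
        segments = future.result()  # then block for the remaining chunks

    print(segments)  # ['chunk-0', 'chunk-1', 'chunk-2']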
@@ -816,7 +894,6 @@ def generate_audio_parler_tts(text):
 
 
 
-
 def fetch_local_events():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
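For a quick end-to-end check of the updated generate_audio_parler_tts, a driver along these lines should work, assuming app.py's module-level device and repo_id are set as in the diff and that the unchanged tail of the function still returns the combined WAV path (write_wav stores the float32 chunks as 32-bit float WAV, which expects samples in the nominal -1.0 to 1.0 range the model produces):

    if __name__ == "__main__":
        logging.basicConfig(level=logging.DEBUG)
        # Hypothetical smoke test; the prompt text is illustrative.
        path = generate_audio_parler_tts("Welcome to Birmingham! Here is what's on this week.")
        print(f"Combined audio saved to: {path}")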
 