Pijush2023 committed · Commit a17fdc2 · verified · 1 Parent(s): 637dfac

Update app.py

Files changed (1): app.py (+17 −101)
app.py CHANGED
@@ -746,84 +746,14 @@ def generate_audio_elevenlabs(text):
      return None
 
 
- # from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
- # from transformers import AutoTokenizer
- # from threading import Thread
-
- # repo_id = "parler-tts/parler-tts-mini-v1"
-
-
-
-
- # def generate_audio_parler_tts(text):
- #     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
- #     chunk_size_in_s = 0.5
-
- #     # Initialize the tokenizer and model
- #     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
- #     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
- #     sampling_rate = parler_model.audio_encoder.config.sampling_rate
- #     frame_rate = parler_model.audio_encoder.config.frame_rate
-
- #     def generate(text, description, play_steps_in_s=0.5):
- #         play_steps = int(frame_rate * play_steps_in_s)
- #         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
- #         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
- #         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
- #         generation_kwargs = dict(
- #             input_ids=inputs.input_ids,
- #             prompt_input_ids=prompt.input_ids,
- #             attention_mask=inputs.attention_mask,
- #             prompt_attention_mask=prompt.attention_mask,
- #             streamer=streamer,
- #             do_sample=True,
- #             temperature=1.0,
- #             min_new_tokens=10,
- #         )
-
- #         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
- #         thread.start()
-
- #         for new_audio in streamer:
- #             if new_audio.shape[0] == 0:
- #                 break
- #             # Save or process each audio chunk as it is generated
- #             yield sampling_rate, new_audio
-
- #     audio_segments = []
- #     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
- #         audio_segments.append(audio_chunk)
-
- #         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
- #         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
- #         logging.debug(f"Saved chunk to {temp_audio_path}")
-
-
- #     # Combine all the audio chunks into one audio file
- #     combined_audio = np.concatenate(audio_segments)
- #     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
- #     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
-
- #     logging.debug(f"Combined audio saved to {combined_audio_path}")
- #     return combined_audio_path
-
-
- import concurrent.futures
- import tempfile
- import os
- import numpy as np
- import logging
- from queue import Queue
- from threading import Thread
- from scipy.io.wavfile import write as write_wav
  from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
  from transformers import AutoTokenizer
+ from threading import Thread
 
  repo_id = "parler-tts/parler-tts-mini-v1"
- device = "cuda:0"  # or "cpu" if CUDA is not available
+
+
+
 
  def generate_audio_parler_tts(text):
      description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
@@ -859,39 +789,23 @@ def generate_audio_parler_tts(text):
          for new_audio in streamer:
              if new_audio.shape[0] == 0:
                  break
+             # Save or process each audio chunk as it is generated
              yield sampling_rate, new_audio
 
-     # Queue to hold the audio chunks
-     audio_queue = Queue()
-     combined_audio = []
-
-     def process_chunks():
-         while True:
-             sampling_rate, audio_chunk = audio_queue.get()
-             if audio_chunk is None:  # Stop processing when a None is received
-                 break
-             combined_audio.append(audio_chunk)
-             temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(combined_audio)}.wav")
-             write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-             logging.debug(f"Saved chunk to {temp_audio_path}")
-             # Start playing or buffering the audio chunk here if required
-             # (e.g., send to a player or a frontend for immediate playback)
-
-     # Start the chunk processing in a separate thread
-     with concurrent.futures.ThreadPoolExecutor() as executor:
-         executor.submit(process_chunks)
-
-         for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-             audio_queue.put((sampling_rate, audio_chunk))
-
-         # Signal the end of processing
-         audio_queue.put((None, None))
+     audio_segments = []
+     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+         audio_segments.append(audio_chunk)
+
+         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+         logging.debug(f"Saved chunk to {temp_audio_path}")
+
 
      # Combine all the audio chunks into one audio file
-     combined_audio_np = np.concatenate(combined_audio)
+     combined_audio = np.concatenate(audio_segments)
      combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
 
-     write_wav(combined_audio_path, sampling_rate, combined_audio_np.astype(np.float32))
+     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
 
      logging.debug(f"Combined audio saved to {combined_audio_path}")
      return combined_audio_path
@@ -901,6 +815,8 @@ def generate_audio_parler_tts(text):
 
 
 
+
+
  def fetch_local_events():
      api_key = os.environ['SERP_API']
      url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
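
Note on the diff above: the commit replaces the queue-and-ThreadPoolExecutor chunk pipeline with a plain synchronous loop over the streaming generator, and it drops the hard-coded device = "cuda:0" assignment without adding a replacement in the hunks shown. Since generate() still references device, it is presumably defined elsewhere in app.py. A minimal sketch of a portable device selection plus a call to the updated function; the availability check and the call site are assumptions, not part of this commit:

import torch

# Assumption: app.py defines `device` somewhere outside the hunks shown.
# This is one portable way to pick it, not the commit's actual code.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# The updated generate_audio_parler_tts streams chunks, saves each one to the
# temp directory, and returns the path of the combined WAV file.
wav_path = generate_audio_parler_tts("Hello from Birmingham!")
print(f"Combined audio written to {wav_path}")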
 
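For the unchanged fetch_local_events shown as context in the last hunk: a minimal sketch of how the SerpAPI google_events response might be consumed, assuming the payload nests matches under an "events_results" key. The helper name and the parsing are illustrative assumptions, not part of this commit:

import os
import requests

def fetch_event_titles():
    # Hypothetical helper mirroring fetch_local_events from app.py.
    api_key = os.environ['SERP_API']
    url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
    response = requests.get(url)
    response.raise_for_status()
    # Assumption: SerpAPI's google_events engine returns matches in "events_results".
    events = response.json().get("events_results", [])
    return [event.get("title", "") for event in events]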