BarBar288 committed on
Commit
c8216a6
·
verified ·
1 Parent(s): 8132a50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -1
app.py CHANGED
@@ -56,6 +56,16 @@ object_detection_pipeline = pipeline("object-detection", model="facebook/detr-re
56
  video_classification_pipeline = pipeline("video-classification", model="facebook/timesformer-base-finetuned-k400")
57
  summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
58
 
 
 
 
 
 
 
 
 
 
 
59
  # Use a different model for text-to-audio if stabilityai/stable-audio-open-1.0 is not supported
60
  try:
61
  text_to_audio_pipeline = pipeline("text-to-audio", model="stabilityai/stable-audio-open-1.0", use_auth_token=read_token)
@@ -63,6 +73,7 @@ except ValueError as e:
63
  logger.error(f"Error loading stabilityai/stable-audio-open-1.0: {e}")
64
  logger.info("Falling back to a different text-to-audio model.")
65
  text_to_audio_pipeline = pipeline("text-to-audio", model="microsoft/speecht5_tts")
 
66
 
67
  audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base")
68
 
@@ -140,7 +151,8 @@ def summarize_text(text):
140
  return result[0]["summary_text"]
141
 
142
  def text_to_audio(text):
143
- result = text_to_audio_pipeline(text)
 
144
  return result["audio"]
145
 
146
  def audio_classification(audio):
 
56
  video_classification_pipeline = pipeline("video-classification", model="facebook/timesformer-base-finetuned-k400")
57
  summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
58
 
59
+ # Load speaker embeddings for text-to-audio
60
+ def load_speaker_embeddings(model_name):
61
+ if model_name == "microsoft/speecht5_tts":
62
+ logger.info("Loading speaker embeddings for SpeechT5")
63
+ from datasets import load_dataset
64
+ dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
65
+ speaker_embeddings = torch.tensor(dataset[7306]["xvector"]).unsqueeze(0) # Example speaker
66
+ return speaker_embeddings
67
+ return None
68
+
69
  # Use a different model for text-to-audio if stabilityai/stable-audio-open-1.0 is not supported
70
  try:
71
  text_to_audio_pipeline = pipeline("text-to-audio", model="stabilityai/stable-audio-open-1.0", use_auth_token=read_token)
 
73
  logger.error(f"Error loading stabilityai/stable-audio-open-1.0: {e}")
74
  logger.info("Falling back to a different text-to-audio model.")
75
  text_to_audio_pipeline = pipeline("text-to-audio", model="microsoft/speecht5_tts")
76
+ speaker_embeddings = load_speaker_embeddings("microsoft/speecht5_tts")
77
 
78
  audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base")
79
 
 
151
  return result[0]["summary_text"]
152
 
153
  def text_to_audio(text):
154
+ global speaker_embeddings
155
+ result = text_to_audio_pipeline(text, speaker_embeddings=speaker_embeddings)
156
  return result["audio"]
157
 
158
  def audio_classification(audio):