BarBar288 committed (verified) · commit a88c901 · parent b6b21f5

Update app.py

Files changed (1): app.py (+14 / -14)
app.py CHANGED
@@ -52,12 +52,12 @@ text_to_image_pipelines = {}
  text_to_speech_pipelines = {}
 
  # Initialize pipelines for other tasks
- visual_qa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa", device=device)
- document_qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)
- image_classification_pipeline = pipeline("image-classification", model="facebook/deit-base-distilled-patch16-224", device=device)
- object_detection_pipeline = pipeline("object-detection", model="facebook/detr-resnet-50", device=device)
- video_classification_pipeline = pipeline("video-classification", model="facebook/timesformer-base-finetuned-k400", device=device)
- summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
+ visual_qa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
+ document_qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
+ image_classification_pipeline = pipeline("image-classification", model="facebook/deit-base-distilled-patch16-224")
+ object_detection_pipeline = pipeline("object-detection", model="facebook/detr-resnet-50")
+ video_classification_pipeline = pipeline("video-classification", model="facebook/timesformer-base-finetuned-k400")
+ summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
 
  # Load speaker embeddings for text-to-audio
  def load_speaker_embeddings(model_name):
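This hunk drops the explicit `device=device` argument from every `pipeline(...)` call. Without that argument, transformers defaults to CPU placement (`device=-1`), assuming no Accelerate device map is configured elsewhere. A minimal sketch of the resulting call pattern, using one of the models above:

```python
from transformers import pipeline

# With no `device` argument, transformers places the model on CPU by default.
# Pass device=0 (or device="cuda:0") to opt back in to a GPU explicitly.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = "The tower is 324 metres tall, about the same height as an 81-storey building."
print(summarizer(text, max_length=30, min_length=5)[0]["summary_text"])
```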
@@ -65,26 +65,26 @@ def load_speaker_embeddings(model_name):
          logger.info("Loading speaker embeddings for SpeechT5")
          from datasets import load_dataset
          dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-         speaker_embeddings = torch.tensor(dataset[7306]["xvector"]).unsqueeze(0).to(device)  # Example speaker
+         speaker_embeddings = torch.tensor(dataset[7306]["xvector"]).unsqueeze(0)  # Example speaker
          return speaker_embeddings
      return None
 
  # Use a different model for text-to-audio if stabilityai/stable-audio-open-1.0 is not supported
  try:
-     text_to_audio_pipeline = pipeline("text-to-audio", model="stabilityai/stable-audio-open-1.0", device=device)
+     text_to_audio_pipeline = pipeline("text-to-audio", model="stabilityai/stable-audio-open-1.0")
  except ValueError as e:
      logger.error(f"Error loading stabilityai/stable-audio-open-1.0: {e}")
      logger.info("Falling back to a different text-to-audio model.")
-     text_to_audio_pipeline = pipeline("text-to-audio", model="microsoft/speecht5_tts", device=device)
+     text_to_audio_pipeline = pipeline("text-to-audio", model="microsoft/speecht5_tts")
      speaker_embeddings = load_speaker_embeddings("microsoft/speecht5_tts")
 
- audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base", device=device)
+ audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base")
 
  def load_conversational_model(model_name):
      if model_name not in conversational_models_loaded:
          logger.info(f"Loading conversational model: {model_name}")
          tokenizer = AutoTokenizer.from_pretrained(conversational_models[model_name], use_auth_token=read_token)
-         model = AutoModelForCausalLM.from_pretrained(conversational_models[model_name], use_auth_token=read_token).to(device)
+         model = AutoModelForCausalLM.from_pretrained(conversational_models[model_name], use_auth_token=read_token)
          conversational_tokenizers[model_name] = tokenizer
          conversational_models_loaded[model_name] = model
      return conversational_tokenizers[model_name], conversational_models_loaded[model_name]
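For context on the fallback path: SpeechT5 needs an x-vector speaker embedding at inference time, which is what `load_speaker_embeddings` provides. A self-contained sketch of how that embedding is consumed; the `forward_params` routing follows the documented text-to-audio pipeline API, though the exact keyword may vary by transformers version:

```python
import torch
from datasets import load_dataset
from transformers import pipeline

tts = pipeline("text-to-audio", model="microsoft/speecht5_tts")

# One x-vector from the CMU ARCTIC set; index 7306 matches the app's choice.
dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(dataset[7306]["xvector"]).unsqueeze(0)

# The pipeline returns a dict with the waveform and its sampling rate.
out = tts("Hello from SpeechT5.", forward_params={"speaker_embeddings": speaker_embeddings})
print(out["sampling_rate"], out["audio"].shape)
```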
@@ -93,7 +93,7 @@ def chat(model_name, user_input, history=[]):
      tokenizer, model = load_conversational_model(model_name)
 
      # Encode the input
-     input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt").to(device)
+     input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
 
      # Generate a response
      with torch.no_grad():
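The `chat` function follows the standard causal-LM loop: encode the user turn plus an EOS token, generate under `torch.no_grad()`, and decode only the newly generated tokens. A runnable sketch; DialoGPT stands in here as an assumption, since the app's `conversational_models` dict is not shown in this diff:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "microsoft/DialoGPT-medium"  # placeholder for the app's model choice
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

# Encode the input, terminated with EOS as in the hunk above.
input_ids = tokenizer.encode("Hello, how are you?" + tokenizer.eos_token, return_tensors="pt")

# Generate a response without tracking gradients.
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=200, pad_token_id=tokenizer.eos_token_id)

# Decode only the tokens generated after the prompt.
print(tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))
```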
@@ -113,7 +113,7 @@ def generate_image(model_name, prompt):
      if model_name not in text_to_image_pipelines:
          logger.info(f"Loading text-to-image model: {model_name}")
          text_to_image_pipelines[model_name] = StableDiffusionPipeline.from_pretrained(
-             text_to_image_models[model_name], use_auth_token=read_token, torch_dtype=torch.float16, device_map="auto"
+             text_to_image_models[model_name], use_auth_token=read_token, torch_dtype=torch.float16
          )
      pipeline = text_to_image_pipelines[model_name]
      image = pipeline(prompt).images[0]
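One caveat with this hunk: it keeps `torch_dtype=torch.float16` while removing `device_map="auto"`, and half-precision weights generally need a GPU, since many float16 ops are unimplemented on CPU. A hedged sketch of explicit placement; the model id is illustrative, not the app's actual `text_to_image_models` entry:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # illustrative model id
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")  # explicit placement now that device_map="auto" is gone

image = pipe("a watercolor fox in a forest").images[0]
image.save("fox.png")
```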
@@ -123,7 +123,7 @@ def generate_speech(model_name, text):
      if model_name not in text_to_speech_pipelines:
          logger.info(f"Loading text-to-speech model: {model_name}")
          text_to_speech_pipelines[model_name] = pipeline(
-             "text-to-speech", model=text_to_speech_models[model_name], use_auth_token=read_token, device=device
+             "text-to-speech", model=text_to_speech_models[model_name], use_auth_token=read_token
          )
      pipeline = text_to_speech_pipelines[model_name]
      audio = pipeline(text, speaker_embeddings=speaker_embeddings)
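Both `generate_image` and `generate_speech` use the same lazy-cache idiom: load a pipeline on first use, keyed by model name, and reuse it afterwards. A minimal generic sketch of that pattern; the `get_pipeline` helper and `_pipeline_cache` names are hypothetical, not part of the app:

```python
from transformers import pipeline

_pipeline_cache = {}  # hypothetical module-level cache

def get_pipeline(task, model_name):
    """Load a pipeline on first request and reuse it afterwards."""
    key = (task, model_name)
    if key not in _pipeline_cache:
        _pipeline_cache[key] = pipeline(task, model=model_name)
    return _pipeline_cache[key]

# Repeated calls hit the cache instead of reloading weights.
clf = get_pipeline("text-classification", "distilbert-base-uncased-finetuned-sst-2-english")
print(clf("Caching pipelines keeps the app responsive.")[0])
```

A review note on the context lines: inside `generate_speech`, assigning `pipeline = text_to_speech_pipelines[model_name]` makes `pipeline` a local name for the whole function, so the earlier `pipeline("text-to-speech", ...)` call raises `UnboundLocalError` on first load; renaming the local (e.g. `tts_pipe`) would avoid this.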
 