Update app.py
app.py CHANGED
@@ -4,8 +4,7 @@ from diffusers import StableDiffusionPipeline
 import torch
 import os
 import logging
 from huggingface_hub import login
-import requests
 import accelerate
 
 # Set up logging
@@ -56,12 +55,12 @@ text_to_speech_pipelines = {}
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Device set to use {device}")
 
-visual_qa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
-document_qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
-image_classification_pipeline = pipeline("image-classification", model="facebook/deit-base-distilled-patch16-224")
-object_detection_pipeline = pipeline("object-detection", model="facebook/detr-resnet-50")
-video_classification_pipeline = pipeline("video-classification", model="facebook/timesformer-base-finetuned-k400")
-summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
+visual_qa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa", device=device)
+document_qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)
+image_classification_pipeline = pipeline("image-classification", model="facebook/deit-base-distilled-patch16-224", device=device)
+object_detection_pipeline = pipeline("object-detection", model="facebook/detr-resnet-50", device=device)
+video_classification_pipeline = pipeline("video-classification", model="facebook/timesformer-base-finetuned-k400", device=device)
+summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
 
 # Load speaker embeddings for text-to-audio
 def load_speaker_embeddings(model_name):
@@ -75,14 +74,14 @@ def load_speaker_embeddings(model_name):
 
 # Use a different model for text-to-audio if stabilityai/stable-audio-open-1.0 is not supported
 try:
-    text_to_audio_pipeline = pipeline("text-to-audio", model="stabilityai/stable-audio-open-1.0")
+    text_to_audio_pipeline = pipeline("text-to-audio", model="stabilityai/stable-audio-open-1.0", device=device)
 except ValueError as e:
     logger.error(f"Error loading stabilityai/stable-audio-open-1.0: {e}")
     logger.info("Falling back to a different text-to-audio model.")
-    text_to_audio_pipeline = pipeline("text-to-audio", model="microsoft/speecht5_tts")
+    text_to_audio_pipeline = pipeline("text-to-audio", model="microsoft/speecht5_tts", device=device)
     speaker_embeddings = load_speaker_embeddings("microsoft/speecht5_tts")
 
-audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base")
+audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base", device=device)
 
 def load_conversational_model(model_name):
     if model_name not in conversational_models_loaded:
@@ -92,11 +91,22 @@ def load_conversational_model(model_name):
             use_auth_token=read_token,
             trust_remote_code=True
         )
-        model = AutoModelForCausalLM.from_pretrained(
-            conversational_models[model_name],
-            use_auth_token=read_token,
-            trust_remote_code=True
-        )
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                conversational_models[model_name],
+                use_auth_token=read_token,
+                trust_remote_code=True,
+                device_map="auto" if torch.cuda.is_available() else "cpu"
+            )
+        except RuntimeError as e:
+            logger.error(f"RuntimeError: {e}")
+            logger.info("Falling back to CPU for the model.")
+            model = AutoModelForCausalLM.from_pretrained(
+                conversational_models[model_name],
+                use_auth_token=read_token,
+                trust_remote_code=True,
+                device_map="cpu"
+            )
         conversational_tokenizers[model_name] = tokenizer
         conversational_models_loaded[model_name] = model
     return conversational_tokenizers[model_name], conversational_models_loaded[model_name]
@@ -105,7 +115,7 @@ def chat(model_name, user_input, history=[]):
     tokenizer, model = load_conversational_model(model_name)
 
     # Encode the input
-    input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
+    input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt").to(device)
 
     # Generate a response
     with torch.no_grad():
@@ -127,8 +137,8 @@ def generate_image(model_name, prompt):
         text_to_image_pipelines[model_name] = StableDiffusionPipeline.from_pretrained(
             text_to_image_models[model_name],
             use_auth_token=read_token,
-            torch_dtype=torch.float16,
-            device_map="auto"
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else "cpu"
         )
         pipeline = text_to_image_pipelines[model_name]
         image = pipeline(prompt).images[0]
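The pattern this commit applies throughout app.py is worth seeing in isolation: resolve the torch device once, pass it to every transformers pipeline, move generation inputs onto the model's device, and wrap the GPU-preferring model load in a RuntimeError fallback to CPU. The sketch below is illustrative, not part of the Space: the model id "microsoft/DialoGPT-small" and the helper name load_causal_lm are assumptions chosen to make a small runnable example.

# Minimal sketch of the commit's device-selection and CPU-fallback pattern.
# Assumptions (not from app.py): the model id "microsoft/DialoGPT-small" and
# the helper name load_causal_lm are illustrative; requires torch,
# transformers, and accelerate (for device_map="auto").
import logging

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Resolve the device once, as app.py does, and reuse it everywhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Device set to use {device}")

# transformers pipelines accept a `device` argument, so each task pipeline
# lands on the chosen device instead of silently defaulting to CPU.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

def load_causal_lm(model_id):
    # Prefer automatic placement (needs accelerate installed); a RuntimeError
    # such as CUDA out-of-memory triggers the CPU retry, mirroring the commit.
    try:
        return AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto" if torch.cuda.is_available() else "cpu",
        )
    except RuntimeError as e:
        logger.error(f"RuntimeError: {e}")
        logger.info("Falling back to CPU for the model.")
        return AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = load_causal_lm("microsoft/DialoGPT-small")

# Inputs must live on the same device as the model's weights; targeting
# model.device (rather than the global `device`) stays correct even when
# the CPU fallback fired.
input_ids = tokenizer.encode("Hello!" + tokenizer.eos_token, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

One caveat the sketch sidesteps: the commit's chat() moves input_ids to the global device even when the model load fell back to CPU, which can raise a device-mismatch error on a GPU machine; sending inputs to model.device avoids that.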