Spaces:
Runtime error
Runtime error
start = time.time() | |
pipe = pipes[model_id]["model"] | |
if "device" in pipes[model_id]: | |
try: | |
pipe.to(pipes[model_id]["device"]) | |
except: | |
pipe.device = torch.device(pipes[model_id]["device"]) | |
pipe.model.to(pipes[model_id]["device"]) | |
result = None | |
try: | |
# text to video | |
if model_id == "damo-vilab/text-to-video-ms-1.7b": | |
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) | |
# pipe.enable_model_cpu_offload() | |
prompt = data["text"] | |
video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames | |
file_name = str(uuid.uuid4())[:4] | |
video_path = export_to_video(video_frames, f"public/videos/{file_name}.mp4") | |
new_file_name = str(uuid.uuid4())[:4] | |
os.system(f"ffmpeg -i {video_path} -vcodec libx264 public/videos/{new_file_name}.mp4") | |
if os.path.exists(f"public/videos/{new_file_name}.mp4"): | |
result = {"path": f"/videos/{new_file_name}.mp4"} | |
else: | |
result = {"path": f"/videos/{file_name}.mp4"} | |
# controlnet | |
if model_id.startswith("lllyasviel/sd-controlnet-"): | |
pipe.controlnet.to('cpu') | |
pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"]) | |
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) | |
control_image = load_image(data["img_url"]) | |
# generator = torch.manual_seed(66) | |
out_image: Image = pipe(data["text"], num_inference_steps=20, image=control_image).images[0] | |
file_name = str(uuid.uuid4())[:4] | |
out_image.save(f"public/images/{file_name}.png") | |
result = {"path": f"/images/{file_name}.png"} | |
if model_id.endswith("-control"): | |
image = load_image(data["img_url"]) | |
if "scribble" in model_id: | |
control = pipe(image, scribble = True) | |
elif "canny" in model_id: | |
control = pipe(image, low_threshold=100, high_threshold=200) | |
else: | |
control = pipe(image) | |
file_name = str(uuid.uuid4())[:4] | |
control.save(f"public/images/{file_name}.png") | |
result = {"path": f"/images/{file_name}.png"} | |
# image to image | |
if model_id == "lambdalabs/sd-image-variations-diffusers": | |
im = load_image(data["img_url"]) | |
file_name = str(uuid.uuid4())[:4] | |
with open(f"public/images/{file_name}.png", "wb") as f: | |
f.write(data) | |
tform = transforms.Compose([ | |
transforms.ToTensor(), | |
transforms.Resize( | |
(224, 224), | |
interpolation=transforms.InterpolationMode.BICUBIC, | |
antialias=False, | |
), | |
transforms.Normalize( | |
[0.48145466, 0.4578275, 0.40821073], | |
[0.26862954, 0.26130258, 0.27577711]), | |
]) | |
inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0) | |
out = pipe(inp, guidance_scale=3) | |
out["images"][0].save(f"public/images/{file_name}.jpg") | |
result = {"path": f"/images/{file_name}.jpg"} | |
# image to text | |
if model_id == "Salesforce/blip-image-captioning-large": | |
raw_image = load_image(data["img_url"]).convert('RGB') | |
text = data["text"] | |
inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"]) | |
out = pipe.generate(**inputs) | |
caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True) | |
result = {"generated text": caption} | |
if model_id == "ydshieh/vit-gpt2-coco-en": | |
img_url = data["img_url"] | |
generated_text = pipe(img_url)[0]['generated_text'] | |
result = {"generated text": generated_text} | |
if model_id == "nlpconnect/vit-gpt2-image-captioning": | |
image = load_image(data["img_url"]).convert("RGB") | |
pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values | |
pixel_values = pixel_values.to(pipes[model_id]["device"]) | |
generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1}) | |
generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0] | |
result = {"generated text": generated_text} | |
# image to text: OCR | |
if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten": | |
image = load_image(data["img_url"]).convert("RGB") | |
pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values | |
pixel_values = pixel_values.to(pipes[model_id]["device"]) | |
generated_ids = pipe.generate(pixel_values) | |
generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0] | |
result = {"generated text": generated_text} | |
# text to image | |
if model_id == "runwayml/stable-diffusion-v1-5": | |
file_name = str(uuid.uuid4())[:4] | |
text = data["text"] | |
out = pipe(prompt=text) | |
out["images"][0].save(f"public/images/{file_name}.jpg") | |
result = {"path": f"/images/{file_name}.jpg"} | |
# object detection | |
if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101": | |
img_url = data["img_url"] | |
open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"] | |
result = pipe(img_url, candidate_labels=open_types) | |
# VQA | |
if model_id == "dandelin/vilt-b32-finetuned-vqa": | |
question = data["text"] | |
img_url = data["img_url"] | |
result = pipe(question=question, image=img_url) | |
#DQA | |
if model_id == "impira/layoutlm-document-qa": | |
question = data["text"] | |
img_url = data["img_url"] | |
result = pipe(img_url, question) | |
# depth-estimation | |
if model_id == "Intel/dpt-large": | |
output = pipe(data["img_url"]) | |
image = output['depth'] | |
name = str(uuid.uuid4())[:4] | |
image.save(f"public/images/{name}.jpg") | |
result = {"path": f"/images/{name}.jpg"} | |
if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large": | |
image = load_image(data["img_url"]) | |
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt") | |
with torch.no_grad(): | |
outputs = pipe(**inputs) | |
predicted_depth = outputs.predicted_depth | |
prediction = torch.nn.functional.interpolate( | |
predicted_depth.unsqueeze(1), | |
size=image.size[::-1], | |
mode="bicubic", | |
align_corners=False, | |
) | |
output = prediction.squeeze().cpu().numpy() | |
formatted = (output * 255 / np.max(output)).astype("uint8") | |
image = Image.fromarray(formatted) | |
name = str(uuid.uuid4())[:4] | |
image.save(f"public/images/{name}.jpg") | |
result = {"path": f"/images/{name}.jpg"} | |
# TTS | |
if model_id == "espnet/kan-bayashi_ljspeech_vits": | |
text = data["text"] | |
wav = pipe(text)["wav"] | |
name = str(uuid.uuid4())[:4] | |
sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16") | |
result = {"path": f"/audios/{name}.wav"} | |
if model_id == "microsoft/speecht5_tts": | |
text = data["text"] | |
inputs = pipes[model_id]["processor"](text=text, return_tensors="pt") | |
embeddings_dataset = pipes[model_id]["embeddings_dataset"] | |
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"]) | |
pipes[model_id]["vocoder"].to(pipes[model_id]["device"]) | |
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"]) | |
name = str(uuid.uuid4())[:4] | |
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000) | |
result = {"path": f"/audios/{name}.wav"} | |
# ASR | |
if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr": | |
audio_url = data["audio_url"] | |
result = { "text": pipe(audio_url)["text"]} | |
# audio to audio | |
if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": | |
audio_url = data["audio_url"] | |
wav, sr = torchaudio.load(audio_url) | |
with torch.no_grad(): | |
result_wav = pipe(wav.to(pipes[model_id]["device"])) | |
name = str(uuid.uuid4())[:4] | |
sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr) | |
result = {"path": f"/audios/{name}.wav"} | |
if model_id == "microsoft/speecht5_vc": | |
audio_url = data["audio_url"] | |
wav, sr = torchaudio.load(audio_url) | |
inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt") | |
embeddings_dataset = pipes[model_id]["embeddings_dataset"] | |
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) | |
pipes[model_id]["vocoder"].to(pipes[model_id]["device"]) | |
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"]) | |
name = str(uuid.uuid4())[:4] | |
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000) | |
result = {"path": f"/audios/{name}.wav"} | |
# segmentation | |
if model_id == "facebook/detr-resnet-50-panoptic": | |
result = [] | |
segments = pipe(data["img_url"]) | |
image = load_image(data["img_url"]) | |
colors = [] | |
for i in range(len(segments)): | |
colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50)) | |
for segment in segments: | |
mask = segment["mask"] | |
mask = mask.convert('L') | |
layer = Image.new('RGBA', mask.size, colors[i]) | |
image.paste(layer, (0, 0), mask) | |
name = str(uuid.uuid4())[:4] | |
image.save(f"public/images/{name}.jpg") | |
result = {"path": f"/images/{name}.jpg"} | |
if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade": | |
image = load_image(data["img_url"]) | |
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"]) | |
outputs = pipe(**inputs) | |
result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0] | |
predicted_panoptic_map = result["segmentation"].cpu().numpy() | |
predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8)) | |
name = str(uuid.uuid4())[:4] | |
predicted_panoptic_map.save(f"public/images/{name}.jpg") | |
result = {"path": f"/images/{name}.jpg"} | |
except Exception as e: | |
print(e) | |
traceback.print_exc() | |
result = {"error": {"message": "Error when running the model inference."}} | |
if "device" in pipes[model_id]: | |
try: | |
pipe.to("cpu") | |
torch.cuda.empty_cache() | |
except: | |
pipe.device = torch.device("cpu") | |
pipe.model.to("cpu") | |
torch.cuda.empty_cache() | |
pipes[model_id]["using"] = False | |
if result is None: | |
result = {"error": {"message": "model not found"}} | |
end = time.time() | |
during = end - start | |
print(f"[ complete {model_id} ] {during}s") | |
print(f"[ result {model_id} ] {result}") | |
return result |