# msVision_3 / app.py
import gradio as gr
from transformers import pipeline
from gradio_client import Client
# Load the image-classification pipeline (ViT-Base, 16x16 patches, 224px input).
image_model = pipeline("image-classification", model="google/vit-base-patch16-224")
def generate_music(prompt):
    """Generate a music clip for *prompt* via the AudioLDM-48k Space API.

    Args:
        prompt: Text description of the music to generate.

    Returns:
        The raw prediction from the remote ``/text2audio`` endpoint
        (typically a filepath/URL to the generated audio) — previously
        this function printed the result but returned ``None``.
    """
    client = Client("https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/")
    result = client.predict(
        prompt=prompt,       # bug fix: was hard-coded "Howdy!", ignoring the caller's prompt
        duration=5,          # seconds, valid range 5-15
        guidance_scale=0,    # valid range 0-6
        seed=5,
        num_waveforms=1,     # valid range 1-3
        api_name="/text2audio"
    )
    print(result)
    return result            # bug fix: result was never returned
def generate_voice(prompt):
    """Generate an audio clip for *prompt* using the Tango Space API.

    Args:
        prompt: Text to sonify (here, an image-classification label).

    Returns:
        The raw prediction from the remote ``/predict`` endpoint
        (e.g. a URL or file path to the generated audio).
    """
    tango = Client("https://declare-lab-tango.hf.space/")
    prediction = tango.predict(
        prompt,              # text prompt
        100,                 # diffusion steps
        1,                   # guidance scale
        api_name="/predict"  # API endpoint path
    )
    return prediction
def classify_and_generate_voice(uploaded_image):
    """Classify the uploaded image, then produce matching voice and music.

    Args:
        uploaded_image: PIL image supplied by the Gradio input component.

    Returns:
        Tuple of (top label, voice-generation result, music-generation result),
        mapped to the Label and two Audio output components.
    """
    # Top-1 label from the ViT classifier.
    label = image_model(uploaded_image)[0]['label']
    # Speech describing the recognized object.
    voice = generate_voice("this is " + label)
    # A short music clip themed on the recognized object.
    music = generate_music("The rnb beat of 85BPM drums." + label + ".")
    return label, voice, music
# Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ
iface = gr.Interface(
fn=classify_and_generate_voice,
inputs=gr.Image(type="pil"),
outputs=[gr.Label(), gr.Audio(), gr.Audio()],
title="msVision_3",
description="์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด, ์‚ฌ๋ฌผ์„ ์ธ์‹ํ•˜๊ณ  ํ•ด๋‹นํ•˜๋Š” ์Œ์„ฑ ๋ฐ ์Œ์•…์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.(recognizes object and generate Voice&Music)",
examples=["dog.jpg","cafe.jpg","seoul.png"]
)
# ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰
iface.launch()