File size: 3,222 Bytes
2bd9468
 
7047a68
dedab71
9902a40
8770d52
2bd9468
ba5f8a7
0f49e19
f1317a3
ba5f8a7
59ff24b
ba5f8a7
 
db7dc29
b8bb042
ba5f8a7
1784bac
ebcd803
3ba852d
ba5f8a7
0f49e19
 
 
46ad89d
 
 
 
 
 
0f49e19
 
ebcd803
46ad89d
3377e03
a6d7b81
1edfb40
a6d7b81
 
 
 
 
 
 
 
 
87c119f
3377e03
8770d52
 
a6d7b81
 
3377e03
59ff24b
ebcd803
59ff24b
a6d7b81
ebcd803
a6d7b81
7d07c61
ad7babb
a6d7b81
23708c8
3377e03
23708c8
7d07c61
872e164
5174dc4
0ad2ed2
23708c8
2bd9468
ad7babb
3382a71
a6d7b81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
from transformers import pipeline
from gradio_client import Client 

# Load the image-classification pipeline once, when the module is imported.
image_model = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)

def generate_music(prompt):
    """Generate audio for ``prompt`` through the hosted AudioLDM Space.

    Calls the ``/text2audio`` endpoint of the AudioLDM 48k demo Space with
    fixed generation settings and the caller's text prompt.

    Args:
        prompt: Text description of the audio to generate.

    Returns:
        The value returned by ``Client.predict`` for the endpoint, passed
        through unchanged. NOTE(review): presumably a path/URL to the
        generated media file -- confirm against the Space's API page.
    """
    client = Client("https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/")
    result = client.predict(
        prompt=prompt,  # 'Input your text here' textbox (was hard-coded before)
        duration=5,  # 'Duration (seconds)' slider, range 5 ~ 15
        guidance_scale=0,  # 'Guidance scale' slider, range 0 ~ 6
        seed=5,  # 'Seed' number component
        num_waveforms=1,  # 'Number waveforms to generate' slider, range 1 ~ 3
        api_name="/text2audio",  # API endpoint path
    )
    print(result)
    # Hand the result back so the caller can forward it to its audio output;
    # without this the function implicitly returned None.
    return result


def generate_voice(prompt):
    """Request generated audio for ``prompt`` from the hosted Tango Space.

    Args:
        prompt: Text sent as the first input of the ``/predict`` endpoint.

    Returns:
        The value returned by ``Client.predict``, passed through unchanged.
        NOTE(review): the result's exact shape (file path, URL, or data) is
        not visible here -- confirm against the Space's API page.
    """
    steps = 100
    guidance_scale = 1

    tango = Client("https://declare-lab-tango.hf.space/")
    # Inputs are positional: prompt, steps, guidance scale.
    return tango.predict(prompt, steps, guidance_scale, api_name="/predict")

def classify_and_generate_voice(uploaded_image):
    """Classify an image, then generate voice and music from its top label.

    Args:
        uploaded_image: Image passed straight to ``image_model``.

    Returns:
        A 3-tuple ``(label, voice_result, music_result)``: the label of the
        first prediction, followed by the values returned by
        ``generate_voice`` and ``generate_music``.
    """
    # The first entry of the prediction list is used as the top result.
    best_label = image_model(uploaded_image)[0]['label']

    voice_prompt = "this is " + best_label
    music_prompt = "The rnb beat of 85BPM drums." + best_label + "."

    # Voice is requested first, then music, matching the output order.
    voice_clip = generate_voice(voice_prompt)
    music_clip = generate_music(music_prompt)

    return best_label, voice_clip, music_clip
    
# Build the Gradio interface: one uploaded image in; a label and two audio
# components out, matching the three values classify_and_generate_voice returns.
_image_input = gr.Image(type="pil")
_result_outputs = [gr.Label(), gr.Audio(), gr.Audio()]
_example_images = ["dog.jpg", "cafe.jpg", "seoul.png"]

iface = gr.Interface(
    fn=classify_and_generate_voice,
    inputs=_image_input,
    outputs=_result_outputs,
    title="msVision_3",
    description="์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด, ์‚ฌ๋ฌผ์„ ์ธ์‹ํ•˜๊ณ  ํ•ด๋‹นํ•˜๋Š” ์Œ์„ฑ ๋ฐ ์Œ์•…์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.(recognizes object and generate Voice&Music)",
    examples=_example_images,
)


# Start serving the interface.
iface.launch()