File size: 2,538 Bytes
2bd9468
 
a6d7b81
2bd9468
9902a40
8770d52
2bd9468
64f134f
ebcd803
 
bc5fe2a
 
 
 
 
 
093e936
ebcd803
093e936
bc5fe2a
 
ebcd803
 
 
3377e03
a6d7b81
 
 
 
 
 
 
 
 
 
 
87c119f
3377e03
8770d52
 
a6d7b81
 
3377e03
 
ebcd803
 
a6d7b81
ebcd803
a6d7b81
ebcd803
ad7babb
a6d7b81
23708c8
3377e03
23708c8
e6812f7
872e164
5174dc4
 
23708c8
2bd9468
ad7babb
3382a71
a6d7b81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from transformers import pipeline
from gradio_client import Client  # assumption: the gradio_client library is installed and importable

# Load the image-recognition pipeline once at module import time
# (ViT base model fine-tuned on ImageNet-1k; returns label/score dicts).
image_model = pipeline("image-classification", model="google/vit-base-patch16-224")

def generate_music(image_classification_result):
    """Generate music inspired by an image-classification label.

    Calls the AudioLDM2 text-to-music Hugging Face Space. The original
    block was syntactically broken: the ``client.predict(`` line was
    missing (leaving bare literals and an orphaned ``)``), ``base_prompt``
    was undefined, ``full_prompt`` was built only after the call, and
    ``result`` was never assigned. This version reconstructs the intended
    call, mirroring the argument style of ``generate_voice``.

    Args:
        image_classification_result: label text to weave into the prompt.

    Returns:
        The raw API result (generated audio reference) from the Space.
    """
    # Music-generation API client (AudioLDM2 text2audio/text2music Space).
    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")

    # Build the full prompt BEFORE the API call so it can be passed in.
    base_prompt = "The rnb beat of 85BPM drums. playing Violin."
    full_prompt = f"{base_prompt} Inspired by: {image_classification_result}."

    result = client.predict(
        full_prompt,      # positive text prompt
        "Low quality.",   # negative prompt
        5,                # duration of the music in seconds
        0,                # guidance scale
        5,                # random seed
        1,                # number of waveforms to generate
        fn_index=1,       # function index on the Space
    )
    return result

def generate_voice(prompt):
    """Generate audio for *prompt* via the Tango text-to-audio Space.

    Args:
        prompt: text prompt (here: the image-classification label).

    Returns:
        The raw API result (audio file reference/data) from the Space.
    """
    # Connect to the declare-lab Tango Hugging Face Space.
    tango = Client("https://declare-lab-tango.hf.space/")

    steps = 100          # diffusion steps
    guidance_scale = 1   # classifier-free guidance scale

    # Invoke the Space's /predict endpoint with the prompt and settings.
    # The result carries the generated audio (URL or data blob).
    return tango.predict(
        prompt,
        steps,
        guidance_scale,
        api_name="/predict",
    )

def classify_and_generate_voice(uploaded_image):
    """Classify an uploaded image, then generate matching voice and music.

    Args:
        uploaded_image: PIL image supplied by the Gradio input component.

    Returns:
        Tuple of (top label, voice API result, music API result) — consumed
        by the Label and two Audio output components of the interface.
    """
    # Run image classification; the pipeline returns results ordered by
    # confidence, so the first entry holds the best label.
    best_label = image_model(uploaded_image)[0]['label']

    # Produce a voice clip from the label, and music inspired by it.
    voice_out = generate_voice(best_label)
    music_out = generate_music("Generate music for: " + best_label)

    # Hand label + both audio results back to the Gradio interface
    # (e.g. as a URL or raw audio data, depending on the Space).
    return best_label, voice_out, music_out
    
# Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ
iface = gr.Interface(
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Label(), gr.Audio(), gr.Audio()],
    title="msVision_3",
    description="์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด, ์‚ฌ๋ฌผ์„ ์ธ์‹ํ•˜๊ณ  ํ•ด๋‹นํ•˜๋Š” ์Œ์„ฑ ๋ฐ ์Œ์•…์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.(recognizes object and generate Voice&Music)",
    examples=["dog.jpg", "cat.png", "cafe.jpg"]
)


# ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰
iface.launch()