File size: 1,624 Bytes
2bd9468
 
a6d7b81
2bd9468
9902a40
8770d52
2bd9468
3377e03
a6d7b81
 
 
 
 
 
 
 
 
 
 
87c119f
3377e03
8770d52
 
a6d7b81
 
3377e03
 
a6d7b81
 
 
3377e03
ad7babb
a6d7b81
23708c8
3377e03
23708c8
8770d52
872e164
ad7babb
 
23708c8
2bd9468
ad7babb
3382a71
a6d7b81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
from transformers import pipeline
from gradio_client import Client  # Assumption: the gradio_client library is available.

# Load the image-classification pipeline once, when the module is imported.
image_model = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)

def generate_voice(prompt):
    """Request generated audio for a text prompt from the hosted Tango Space.

    Args:
        prompt: Text sent as the first input of the Space's ``/predict``
            endpoint.

    Returns:
        Whatever ``Client.predict`` returns, passed through unmodified.
        NOTE(review): presumably a path or URL to the generated audio --
        confirm against the Space's API description.
    """
    steps = 100  # "Steps" input of the endpoint
    guidance_scale = 1  # "Guidance Scale" input of the endpoint

    tango = Client("https://declare-lab-tango.hf.space/")
    # No post-processing is applied; the raw response goes back to the caller.
    return tango.predict(prompt, steps, guidance_scale, api_name="/predict")

def classify_and_generate_voice(uploaded_image):
    """Classify an image and generate audio for its top predicted label.

    Args:
        uploaded_image: The image supplied by the Gradio input component; it
            is handed unchanged to ``image_model``. ``None`` is treated as
            "nothing was uploaded".

    Returns:
        A ``(label, audio)`` tuple: the ``'label'`` of the first prediction
        and the value ``generate_voice`` returned for that label.

    Raises:
        gr.Error: If ``uploaded_image`` is ``None``.
    """
    # Submitting the form without an image yields None. Fail early with a
    # message Gradio can show to the user rather than letting the classifier
    # raise an opaque error on a missing input.
    if uploaded_image is None:
        raise gr.Error("Please upload an image first.")

    # Image classification.
    predictions = image_model(uploaded_image)
    # The first entry is taken as the most probable class.
    top_prediction = predictions[0]['label']

    # Audio generation, using the predicted label as the prompt.
    voice_result = generate_voice(top_prediction)

    # Both values go to the interface outputs in this order.
    return top_prediction, voice_result
    
# Build the Gradio interface: one image in, a label and an audio clip out.
iface = gr.Interface(
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Label(), gr.Audio()],
    title="msVision_3",
    # Korean UI text, restored from a mis-decoded (mojibake) literal. Meaning:
    # "Upload an image and the object in it is recognized and a matching
    # sound is generated."
    description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성을 생성합니다.",
    # NOTE(review): these example files are presumably expected next to this
    # script -- confirm they are shipped with the app.
    examples=["dog.jpg", "cat.jpg"],
)


# Start the interface.
iface.launch()