File size: 1,506 Bytes
2bd9468
 
8770d52
2bd9468
9902a40
8770d52
2bd9468
ef8b4e2
 
 
8770d52
 
ef8b4e2
8770d52
 
ef8b4e2
8770d52
 
 
 
87c119f
8770d52
 
 
 
87c119f
8770d52
 
 
c034f68
8770d52
2bd9468
87c119f
23708c8
8770d52
23708c8
8770d52
 
 
23708c8
2bd9468
3382a71
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os

import gradio as gr
import requests
from transformers import pipeline

# ์ด๋ฏธ์ง€ ์ธ์‹ ํŒŒ์ดํ”„๋ผ์ธ ๋กœ๋“œ
image_model = pipeline("image-classification", model="google/vit-base-patch16-224")

# API ํ† ํฐ ๋กœ๋“œ
hugging_face_auth_token = os.getenv("HUGGING_FACE_AUTH_TOKEN")

def get_audiogen(prompt):
    """Generate audio for *prompt* via the Hugging Face Inference API.

    On success the Inference API returns raw audio bytes for audio models;
    those bytes are written to a temporary file and its path is returned,
    which ``gr.Audio`` can play directly. When the API responds with
    anything else (model still loading, error payload), the decoded JSON
    body is returned instead — matching the original fallback behavior.

    Args:
        prompt: Text prompt describing the audio to generate.

    Returns:
        str path to a temporary audio file, or the JSON error payload.
    """
    import tempfile  # local import: only needed to persist returned audio bytes

    headers = {"Authorization": f"Bearer {hugging_face_auth_token}"}
    response = requests.post(
        "https://api-inference.huggingface.co/models/fffiloni/audiogen",
        headers=headers,
        json={"inputs": prompt, "parameters": {"length": 10}, "options": {"use_cache": False}},
        timeout=60,  # don't hang forever on a stalled inference request
    )
    content_type = response.headers.get("content-type", "")
    if response.ok and content_type.startswith("audio"):
        # Success path: persist the raw audio bytes so Gradio's Audio
        # component has a playable file to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(response.content)
            return tmp.name
    # Non-audio response (loading/error): hand back the JSON body,
    # as the original implementation did with ``response.json()``.
    return response.json()

def classify_and_generate_audio(uploaded_image):
    """Classify an uploaded image, then generate audio for the top label.

    Args:
        uploaded_image: PIL image supplied by the Gradio Image input.

    Returns:
        Tuple of (top classification label, audio-generation result).
    """
    # Run the classifier and keep only the highest-confidence label.
    label = image_model(uploaded_image)[0]["label"]
    # Use that label as the prompt for the audio-generation backend.
    return label, get_audiogen(label)

# Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ
iface = gr.Interface(
    fn=classify_and_generate_audio,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Label(), gr.Audio()],
    title="์ด๋ฏธ์ง€ ๋ถ„๋ฅ˜ ๋ฐ ์˜ค๋””์˜ค ์ƒ์„ฑ",
    description="์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด, ์ด๋ฏธ์ง€๋ฅผ ๋ถ„์„ํ•˜์—ฌ ๋ฌด์—‡์ธ์ง€ ์„ค๋ช…ํ•˜๊ณ , ํ•ด๋‹นํ•˜๋Š” ์˜ค๋””์˜ค๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค."
)

# ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰
iface.launch()