import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
import json

# Initialize AI models
emotion_analyzer = pipeline("audio-classification", model="MIT/ast-finetuned-speech-commands-v2")
speech_recognizer = pipeline("automatic-speech-recognition",
                             model="kresnik/wav2vec2-large-xlsr-korean")

# Global state management
current_stage = "intro"
session_data = {}
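
# Note: both checkpoints use 16 kHz feature extractors, so recordings are
# resampled to 16 kHz in analyze_voice() below before inference.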


def create_interface():
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # State management
        state = gr.State(value={"stage": "intro", "session_data": {}})

        # Header
        gr.Markdown("# 디지털 굿판")

        # Navigation tabs
        with gr.Tabs() as tabs:
            # Intro / worldview stage
            with gr.Tab("입장", id="intro"):
                gr.Markdown("""
# 디지털 굿판에 오신 것을 환영합니다
온천천의 디지털 치유 공간으로 들어가보세요.
""")
                intro_next = gr.Button("여정 시작하기")

            # 청신 Stage (Sound Purification)
            with gr.Tab("청신", id="cleansing", visible=False):
                with gr.Row():
                    audio_player = gr.Audio(
                        value="path_to_default_sound.mp3",  # placeholder: point this at the bundled default sound file
                        type="filepath",
                        label="온천천의 소리"
                    )
                    location_info = gr.Textbox(
                        label="현재 위치",
                        value="온천장역",
                        interactive=False
                    )
                cleansing_next = gr.Button("다음 단계로")

            # 기원 Stage (Voice Analysis)
            with gr.Tab("기원", id="voice", visible=False):
                with gr.Row():
                    # Voice input component
                    voice_input = gr.Audio(
                        label="목소리로 전하기",
                        sources=["microphone", "upload"],
                        type="filepath"
                    )
                    # Analysis results
                    with gr.Column():
                        emotion_output = gr.JSON(
                            label="감정 분석 결과",
                            visible=True
                        )
                        text_output = gr.Textbox(
                            label="음성 텍스트",
                            visible=True
                        )
                voice_next = gr.Button("다음 단계로")

            # 송신 Stage (Sharing)
            with gr.Tab("송신", id="sharing", visible=False):
                with gr.Row():
                    gr.Gallery(
                        label="생성된 이미지",
                        show_label=True,
                        elem_id="gallery"
                    )
                gr.Markdown("## 공동체와 함께 나누기")
                complete_button = gr.Button("완료")

        # Floating navigation menu
        with gr.Row(visible=True) as float_menu:
            gr.Button("🏠", scale=1)
            gr.Button("🎵", scale=1)
            gr.Button("🎤", scale=1)
            gr.Button("🖼️", scale=1)
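
        # The floating menu buttons above are purely visual at this point;
        # no click handlers are attached to them yet.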

        # Voice analysis function
        def analyze_voice(audio_file, state):
            try:
                if audio_file is None:
                    return {"error": "No audio input provided"}, state

                # Load audio as 16 kHz mono, the rate the pipelines' feature extractors expect
                y, sr = librosa.load(audio_file, sr=16000)

                # Emotion analysis
                emotions = emotion_analyzer({"raw": y, "sampling_rate": sr})
                primary_emotion = emotions[0]

                # Speech to text
                text_result = speech_recognizer({"raw": y, "sampling_rate": sr})

                # Update state
                state["voice_analysis"] = {
                    "emotion": primary_emotion["label"],
                    "probability": float(primary_emotion["score"]),
                    "text": text_result["text"]
                }

                return {
                    "emotion": primary_emotion["label"],
                    "emotion_probability": f"{primary_emotion['score']:.2f}",
                    "transcribed_text": text_result["text"],
                    "status": "Analysis complete"
                }, state
            except Exception as e:
                return {"error": str(e), "status": "Error occurred"}, state

        # Event handlers
        voice_input.change(
            fn=analyze_voice,
            inputs=[voice_input, state],
            outputs=[emotion_output, state]
        )
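
        # Note: .change fires whenever the audio value updates (a recording
        # finishes or a file is uploaded), so analysis runs automatically.
        # text_output is defined above but not listed in outputs, so the
        # transcription currently only appears inside the JSON panel.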

        # Stage navigation: record the new stage in state
        # (the later key wins in a dict merge, so "stage" must come after **s)
        intro_next.click(
            fn=lambda s: {**s, "stage": "cleansing"},
            inputs=[state],
            outputs=[state],
        )
        cleansing_next.click(
            fn=lambda s: {**s, "stage": "voice"},
            inputs=[state],
            outputs=[state],
        )
        voice_next.click(
            fn=lambda s: {**s, "stage": "sharing"},
            inputs=[state],
            outputs=[state],
        )
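
        # These handlers only update the stage stored in state; the hidden tabs
        # stay hidden. A minimal sketch of how the next tab could also be revealed
        # (assumption, not part of the original code: the gr.Tab objects would have
        # to be captured, e.g. `with gr.Tab("청신", id="cleansing", visible=False) as cleansing_tab:`):
        #
        #     intro_next.click(
        #         fn=lambda s: ({**s, "stage": "cleansing"}, gr.update(visible=True)),
        #         inputs=[state],
        #         outputs=[state, cleansing_tab],
        #     )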

    return app


# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch()
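
# To run locally (assuming the dependencies this file uses are installed:
# gradio, transformers, torch, and librosa):
#   python app.py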