# msVision_3 / app.py
import gradio as gr
from transformers import pipeline
from gradio_client import Client
# Load the image-classification pipeline (ViT-Base, 16x16 patches, 224px input).
image_model = pipeline("image-classification", model="google/vit-base-patch16-224")
def generate_music(prompt):
    """Generate a music clip for *prompt* via the AudioLDM-48k Space API.

    Args:
        prompt: Text description of the music to generate.

    Returns:
        The raw prediction from the remote ``/text2audio`` endpoint
        (typically a filepath/URL to the generated audio) — previously
        this function printed the result but returned ``None``.
    """
    client = Client("https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/")
    result = client.predict(
        prompt=prompt,       # bug fix: was hard-coded "Howdy!", ignoring the caller's prompt
        duration=5,          # seconds, valid range 5-15
        guidance_scale=0,    # valid range 0-6
        seed=5,
        num_waveforms=1,     # valid range 1-3
        api_name="/text2audio"
    )
    print(result)
    return result            # bug fix: result was never returned
def generate_voice(prompt):
    """Generate an audio clip for *prompt* using the Tango Space API.

    Args:
        prompt: Text to sonify (here, an image-classification label).

    Returns:
        The raw prediction from the remote ``/predict`` endpoint
        (e.g. a URL or file path to the generated audio).
    """
    tango = Client("https://declare-lab-tango.hf.space/")
    prediction = tango.predict(
        prompt,              # text prompt
        100,                 # diffusion steps
        1,                   # guidance scale
        api_name="/predict"  # API endpoint path
    )
    return prediction
def classify_and_generate_voice(uploaded_image):
    """Classify the uploaded image, then produce matching voice and music.

    Args:
        uploaded_image: PIL image supplied by the Gradio input component.

    Returns:
        Tuple of (top label, voice-generation result, music-generation result),
        mapped to the Label and two Audio output components.
    """
    # Top-1 label from the ViT classifier.
    label = image_model(uploaded_image)[0]['label']
    # Speech describing the recognized object.
    voice = generate_voice("this is " + label)
    # A short music clip themed on the recognized object.
    music = generate_music("The rnb beat of 85BPM drums." + label + ".")
    return label, voice, music
# Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ
iface = gr.Interface(
fn=classify_and_generate_voice,
inputs=gr.Image(type="pil"),
outputs=[gr.Label(), gr.Audio(), gr.Audio()],
title="msVision_3",
description="์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด, ์‚ฌ๋ฌผ์„ ์ธ์‹ํ•˜๊ณ  ํ•ด๋‹นํ•˜๋Š” ์Œ์„ฑ ๋ฐ ์Œ์•…์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.(recognizes object and generate Voice&Music)",
examples=["dog.jpg","cafe.jpg","seoul.png"]
)
# ์ธํ„ฐํŽ˜์ด์Šค ์‹คํ–‰
iface.launch()