Spaces:

krishnapal2308
/

eye_for_blind

Sleeping

eye_for_blind / app.py

krishnapal2308

fixed example caching

8692c47 9 months ago

2.55 kB

	import tempfile
	import gradio as gr
	from gtts import gTTS
	import inference_script
	import vit_gpt2
	import os
	import warnings

	# Install a blanket "ignore" filter so no Python warnings are reported
	# for the rest of the process (no category or module restriction is given).
	warnings.filterwarnings('ignore')


	def process_image_and_generate_output(image, model_selection):
	    """Caption an image with the selected model and synthesize the caption as speech.

	    Args:
	        image: The image coming from the Gradio image input, or ``None``
	            when nothing was provided. It is forwarded unchanged to the
	            selected captioning model.
	        model_selection: Either ``'Basic Model'`` or ``'ViT-GPT2'``.

	    Returns:
	        A ``(text, audio)`` pair. ``text`` is the generated caption, or a
	        short message when no caption can be produced. ``audio`` is the
	        content of the audio file written by gTTS as ``bytes``, or ``None``
	        whenever ``text`` is such a message.
	    """
	    if image is None:
	        return "Please select an image", None

	    if model_selection == 'Basic Model':
	        # Trained only for 15 epochs without any hyperparameter tuning,
	        # utilizing Inception v3.
	        result = inference_script.evaluate(image)
	        # Cut the joined text at its last space, which drops the final token
	        # (presumably the end-of-sequence marker; confirm in inference_script).
	        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
	        # Removing '<unk>' leaves doubled or dangling spaces behind, so
	        # collapse all whitespace runs after stripping the placeholder.
	        pred_caption = ' '.join(pred_caption.replace('<unk>', '').split())
	    elif model_selection == 'ViT-GPT2':
	        # SOTA model for image captioning.
	        result = vit_gpt2.predict_step(image)
	        pred_caption = result[0]
	    else:
	        return "Invalid model selection", None

	    # gTTS refuses empty text, so report the problem instead of crashing.
	    if not pred_caption or not pred_caption.strip():
	        return "Could not generate a caption for this image", None

	    # Reserve a path for the audio and close the handle right away: reopening
	    # a still-open named temporary file by name is not portable.
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
	        audio_file_path = temp_audio.name

	    try:
	        # Generate speech from the caption and read it back as raw bytes.
	        gTTS(text=pred_caption, lang='en', slow=False).save(audio_file_path)
	        with open(audio_file_path, "rb") as f:
	            audio_content = f.read()
	    finally:
	        # delete=False means nothing else removes the file; clean up even
	        # when synthesis or reading fails.
	        os.unlink(audio_file_path)

	    return pred_caption, audio_content


	# Example rows for the interface: every bundled sample image (1-3, stored
	# next to this file under sample_images/) paired with each model choice,
	# ViT-GPT2 first and the basic model second.
	sample_images = [
	    [os.path.join(os.path.dirname(__file__), f"sample_images/{image_number}.jpg"), model_name]
	    for image_number in (1, 2, 3)
	    for model_name in ("ViT-GPT2", "Basic Model")
	]


	# Image component through which the user supplies the picture to caption
	image_input = gr.Image(label="Upload Image")

	# Radio group for choosing which captioning model handles the image
	model_selection_input = gr.Radio(
	    ["Basic Model", "ViT-GPT2"],
	    label="Choose Model",
	)

	# Assemble the Gradio interface: the image and model inputs feed
	# process_image_and_generate_output, whose (caption, audio) result is shown
	# through a text output and an audio output.
	iface = gr.Interface(
	    fn=process_image_and_generate_output,
	    inputs=[image_input, model_selection_input],
	    outputs=["text", "audio"],
	    examples=sample_images,
	    cache_examples=True,
	    allow_flagging='never',
	    # A plain '|' is used here: '\|' is not a valid escape sequence, so it
	    # would put a literal backslash in the title and trigger an
	    # invalid-escape warning.
	    title="Eye For Blind | Image Captioning & TTS",
	    description="To be added",
	)

	# Start serving the interface.
	iface.launch()