Spaces:

matanmichaely
/

image_to_audio_story

Running

App Files Files Community

image_to_audio_story / app.py

matanmichaely

Update app.py

d7ef93f almost 2 years ago

raw

history blame

1.8 kB

	from dotenv import find_dotenv, load_dotenv
	from transformers import pipeline
	import streamlit as st
	import os


	# Locate the .env file, then load its entries into the process environment.
	_dotenv_path = find_dotenv()
	load_dotenv(_dotenv_path)

	# img to text
	def img_to_text(url):
	    """Caption an image with the BLIP large image-captioning model.

	    Args:
	        url: Image reference handed straight to the captioning pipeline
	            (the caller in this file passes a local file path).

	    Returns:
	        The ``generated_text`` field of the first caption produced.
	    """
	    captioner = pipeline(
	        "image-to-text",
	        model="Salesforce/blip-image-captioning-large",
	    )
	    captions = captioner(url)
	    first_caption = captions[0]
	    return first_caption["generated_text"]


	# llm
	def generate_story(text):
	    """Continue *text* into a short story using the distilgpt2 model.

	    Args:
	        text: Prompt to continue (in this app, the image caption).

	    Returns:
	        The ``generated_text`` field of the first sequence returned by the
	        text-generation pipeline.
	    """
	    generator = pipeline("text-generation", model="distilgpt2")

	    # Budget only the newly generated tokens. ``max_length`` also counts the
	    # prompt, so a caption of about 20 tokens would leave no room for a story
	    # (and transformers warns or errors when the prompt exceeds it).
	    result = generator(text, max_new_tokens=20, num_return_sequences=1)
	    return result[0]['generated_text']


	#
	# text-to-speech
	def text_to_speech(text, output_path="audio.flac", timeout=60):
	    """Synthesize *text* through the Hugging Face Inference API and save it.

	    Args:
	        text: Text to send as the ``inputs`` payload.
	        output_path: File the response body is written to. Defaults to
	            ``audio.flac``, the path ``main`` plays back.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Raises:
	        requests.HTTPError: If the API answers with an error status.
	        requests.Timeout: If the API does not answer within *timeout*.
	    """
	    import requests

	    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"

	    # Send credentials only when a token is configured; an unset variable
	    # would otherwise produce the literal header value "Bearer None".
	    headers = {}
	    token = os.environ.get('HUGGINGFACE_API_TOKEN')
	    if token:
	        headers["Authorization"] = f"Bearer {token}"

	    payload = {"inputs": text}

	    # requests has no default timeout; without one a stalled connection
	    # would block the Streamlit script run indefinitely.
	    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
	    response.raise_for_status()
	    with open(output_path, 'wb') as file:
	        file.write(response.content)



	def main():
	    """Render the Streamlit page: upload an image, caption it, narrate a story.

	    Once a JPG is uploaded, the image is shown, captioned with
	    ``img_to_text``, extended with ``generate_story`` and voiced with
	    ``text_to_speech``. The caption, the story and the resulting
	    ``audio.flac`` are then displayed.
	    """
	    import tempfile

	    st.set_page_config(page_title="img to audio story")
	    st.header("turn image to audio story")
	    uploaded_file = st.file_uploader("Choose an image ... ", type="jpg")

	    if uploaded_file is None:
	        return

	    st.image(uploaded_file, caption="Uploaded image", use_column_width=True)

	    # Save the upload under a generated temp name rather than the
	    # client-supplied file name, which could overwrite files in the working
	    # directory or point outside it. The file is closed before captioning so
	    # it can be reopened by path on every platform.
	    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
	        tmp.write(uploaded_file.getvalue())
	        image_path = tmp.name
	    try:
	        text = img_to_text(image_path)
	    finally:
	        # The image is only needed for captioning; do not leave it on disk.
	        os.remove(image_path)

	    story = generate_story(text)
	    text_to_speech(story)

	    with st.expander("text"):
	        st.write(text)
	    with st.expander("story"):
	        st.write(story)
	    st.audio("audio.flac")

	# Run the app only when executed as a script (``streamlit run`` executes the
	# file as ``__main__``), so importing this module does not start the UI.
	if __name__ == "__main__":
	    main()