# vid2voiceover / app.py
from dotenv import load_dotenv
import streamlit as st
from moviepy.editor import VideoFileClip, AudioFileClip
import cv2
import base64
import io
import openai
import os
import requests
import tempfile
# Load environment variables from .env.local
# (supplies PASSWORD and OPENAI_API_KEY read later via os.getenv).
load_dotenv('.env.local')
def check_password():
    """Gate the app behind the PASSWORD value from .env.local.

    Returns:
        True when the user-entered password matches the configured one;
        False otherwise (also False, with an error, when no password is
        configured at all).
    """
    expected = os.getenv('PASSWORD')
    if expected is None:
        st.error("Password is not set in .env.local")
        return False
    entered = st.text_input("Enter the password to proceed", type="password")
    if entered == expected:
        return True
    # Only surface the failure after an explicit button press, so a blank
    # field on first page load is not reported as an error.
    if st.button("Check Password"):
        st.error("Incorrect password")
    return False
def video_to_frames(video_file, frame_sampling_rate=1):
    """Extract frames from an uploaded video as base64-encoded JPEGs.

    Args:
        video_file: File-like object (e.g. a Streamlit UploadedFile)
            holding the raw video bytes.
        frame_sampling_rate: Seconds between sampled frames (default 1).

    Returns:
        Tuple of (base64_frames, temp_video_path, duration_seconds).
        The caller is responsible for deleting temp_video_path.
    """
    # Persist the upload to disk so both moviepy and OpenCV can open it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        tmpfile.write(video_file.read())
        video_filename = tmpfile.name

    # Use moviepy only for metadata, then release its file handle
    # (the original never closed this clip).
    video_clip = VideoFileClip(video_filename)
    video_duration = video_clip.duration
    fps = video_clip.fps
    video_clip.close()

    # Guard against a zero step (fps * rate < 1), which would raise
    # ZeroDivisionError in the modulo below.
    frames_to_skip = max(1, int(fps * frame_sampling_rate))

    video = cv2.VideoCapture(video_filename)
    base64_frames = []
    current_frame = 0
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            if current_frame % frames_to_skip == 0:
                _, buffer = cv2.imencode('.jpg', frame)
                base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
            current_frame += 1
    finally:
        # Release the capture even if JPEG encoding raises.
        video.release()

    print(f"{len(base64_frames)} frames read at a sampling rate of {frame_sampling_rate} second(s) per frame.")
    return base64_frames, video_filename, video_duration
def frames_to_story(base64Frames, prompt, api_key):
    """Generate a voiceover script from sampled video frames via GPT-4 Vision.

    Sends the text prompt plus every 50th frame (resized to 768px) in a
    single user message and returns the model's reply text.
    """
    frame_payload = [{"image": frame, "resize": 768} for frame in base64Frames[0::50]]
    messages = [
        {
            "role": "user",
            "content": [prompt, *frame_payload],
        }
    ]
    result = openai.ChatCompletion.create(
        model="gpt-4-vision-preview",
        messages=messages,
        api_key=api_key,
        headers={"Openai-Version": "2020-11-07"},
        max_tokens=700,
    )
    story = result.choices[0].message.content
    print(story)
    return story
def text_to_audio(text, api_key, voice):
    """Synthesize speech for *text* with OpenAI's TTS endpoint.

    Args:
        text: Script to speak.
        api_key: OpenAI API key, sent as a bearer token.
        voice: OpenAI TTS voice id (e.g. 'echo', 'nova').

    Returns:
        Tuple of (temp_audio_path, BytesIO of the audio bytes). The caller
        is responsible for deleting temp_audio_path.

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {api_key}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": voice,
        },
    )
    if response.status_code != 200:
        # Include the status code and body so failures are diagnosable
        # (the original message omitted both).
        raise Exception(
            f"Request failed with status code {response.status_code}: {response.text}"
        )
    # Read the body once instead of iterating response content twice.
    audio_bytes = response.content
    audio_bytes_io = io.BytesIO(audio_bytes)
    # tts-1 returns MP3 by default; the original's ".wav" suffix mislabeled
    # the container for downstream decoders.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmpfile:
        tmpfile.write(audio_bytes)
        audio_filename = tmpfile.name
    return audio_filename, audio_bytes_io
def merge_audio_video(video_filename, audio_filename, output_filename):
    """Mux an audio track onto a video, trimmed to the shorter duration.

    Args:
        video_filename: Path to the source video.
        audio_filename: Path to the audio track.
        output_filename: Path where the merged H.264/AAC file is written.

    Returns:
        output_filename, after the merged file has been written.
    """
    print("Merging audio and video ...")
    video_clip = VideoFileClip(video_filename)
    audio_clip = AudioFileClip(audio_filename)
    try:
        # Trim both streams to the shorter one so neither trails silence
        # or frozen frames.
        min_duration = min(video_clip.duration, audio_clip.duration)
        video_clip = video_clip.subclip(0, min_duration)
        audio_clip = audio_clip.subclip(0, min_duration)
        final_clip = video_clip.set_audio(audio_clip)
        final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
        # The original never closed the composite clip (handle leak).
        final_clip.close()
    finally:
        # Release file handles even if encoding fails.
        video_clip.close()
        audio_clip.close()
    return output_filename
# def merge_audio_video(video_filename, audio_filename, output_filename):
# print("Merging audio and video ...")
# video_clip = VideoFileClip(video_filename)
# audio_clip = AudioFileClip(audio_filename)
# final_clip = video_clip.set_audio(audio_clip)
# final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
# video_clip.close()
# audio_clip.close()
# return output_filename
def main():
    """Streamlit entry point: gate on password, upload a video, generate a
    GPT-4V voiceover script, synthesize speech, and mux it onto the video.
    """
    st.set_page_config(page_title="AI Voiceover", page_icon="🔮")
    st.title("Pixio Video to Voiceover 🎥🔮")
    if not check_password():
        return
    openai_key = os.getenv('OPENAI_API_KEY')
    if not openai_key:
        st.error("OpenAI API key is not set in .env.local")
        return
    uploaded_file = st.file_uploader("Select a video file", type=["mp4", "avi"])
    # Display label -> OpenAI TTS voice id.
    voice_options = {
        'Echo (Male)': 'echo',
        'Fable (Male)': 'fable',
        'Onyx (Male)': 'onyx',
        'Nova (Female)': 'nova',
        'Shimmer (Female)': 'shimmer',
        'Alloy (Female)': 'alloy'
    }
    option = st.selectbox('Choose the voice you want', list(voice_options.keys()))
    classify = voice_options[option]
    duration_options = list(range(10, 121, 10))  # 10 to 120 seconds, in 10 second intervals
    selected_duration = st.selectbox('Select the desired video duration (seconds)', duration_options)
    # Script generator type drives the default prompt wording below.
    script_type_options = {
        'Product Tutorial': 'Product Tutorial',
        'TikTok': 'TikTok',
        'YouTube Short': 'YouTube Short',
        'Website Tutorial': 'Website Tutorial',
        'General Info': 'General Info'
    }
    selected_script_type = st.selectbox('Choose the script generator type', list(script_type_options.keys()))
    # Incorporating the selected script type and duration into the prompt
    dynamic_prompt_intro = f"Script type: {selected_script_type}. Generate a voiceover script that is approximately {selected_duration} seconds long, tailored to the content and format of a {selected_script_type.lower()}."
    prompt = st.text_area("Edit the voiceover script prompt as needed:", value=dynamic_prompt_intro, height=300)
    if uploaded_file is not None and st.button("START PROCESSING", type="primary"):
        with st.spinner("Video is being processed..."):
            base64Frame, video_filename, video_duration = video_to_frames(uploaded_file, frame_sampling_rate=1)
            if video_duration > selected_duration:
                st.error(f"The video exceeds the selected duration of {selected_duration} seconds.")
                # Fix: the original leaked the temp video file on this
                # early return; remove it before bailing out.
                os.unlink(video_filename)
                return
            text = frames_to_story(base64Frame, prompt, openai_key)
            st.write(text)
            audio_filename, audio_bytes_io = text_to_audio(text, openai_key, classify)
            output_video_filename = os.path.splitext(video_filename)[0] + "_output.mp4"
            final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
            st.video(final_video_filename)
            # Clean up all temp artifacts once Streamlit has the video.
            os.unlink(video_filename)
            os.unlink(audio_filename)
            os.unlink(final_video_filename)


if __name__ == "__main__":
    main()