# vid2voiceover / app.py
from dotenv import load_dotenv
import streamlit as st
from moviepy.editor import VideoFileClip, AudioFileClip
import cv2
import base64
import io
import openai
import os
import requests
import tempfile
# Load environment variables from .env.local
# (supplies PASSWORD and OPENAI_API_KEY read later via os.getenv).
load_dotenv('.env.local')
def check_password():
    """Gate the app behind the PASSWORD value from .env.local.

    Returns:
        True when the user-entered password matches the configured one;
        False otherwise (also False, with an error, when no password is
        configured at all).
    """
    expected = os.getenv('PASSWORD')
    if expected is None:
        st.error("Password is not set in .env.local")
        return False
    entered = st.text_input("Enter the password to proceed", type="password")
    if entered == expected:
        return True
    # Only surface the failure after an explicit button press, so a blank
    # field on first page load is not reported as an error.
    if st.button("Check Password"):
        st.error("Incorrect password")
    return False
def video_to_frames(video_file, frame_sampling_rate=1):
    """Extract frames from an uploaded video as base64-encoded JPEGs.

    Args:
        video_file: File-like object (e.g. a Streamlit UploadedFile)
            holding the raw video bytes.
        frame_sampling_rate: Seconds between sampled frames (default 1).

    Returns:
        Tuple of (base64_frames, temp_video_path, duration_seconds).
        The caller is responsible for deleting temp_video_path.
    """
    # Persist the upload to disk so both moviepy and OpenCV can open it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        tmpfile.write(video_file.read())
        video_filename = tmpfile.name

    # Use moviepy only for metadata, then release its file handle
    # (the original never closed this clip).
    video_clip = VideoFileClip(video_filename)
    video_duration = video_clip.duration
    fps = video_clip.fps
    video_clip.close()

    # Guard against a zero step (fps * rate < 1), which would raise
    # ZeroDivisionError in the modulo below.
    frames_to_skip = max(1, int(fps * frame_sampling_rate))

    video = cv2.VideoCapture(video_filename)
    base64_frames = []
    current_frame = 0
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            if current_frame % frames_to_skip == 0:
                _, buffer = cv2.imencode('.jpg', frame)
                base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
            current_frame += 1
    finally:
        # Release the capture even if JPEG encoding raises.
        video.release()

    print(f"{len(base64_frames)} frames read at a sampling rate of {frame_sampling_rate} second(s) per frame.")
    return base64_frames, video_filename, video_duration
def frames_to_story(base64Frames, prompt, api_key):
    """Generate a voiceover script from sampled video frames via GPT-4 Vision.

    Sends the text prompt plus every 50th frame (resized to 768px) in a
    single user message and returns the model's reply text.
    """
    frame_payload = [{"image": frame, "resize": 768} for frame in base64Frames[0::50]]
    messages = [
        {
            "role": "user",
            "content": [prompt, *frame_payload],
        }
    ]
    result = openai.ChatCompletion.create(
        model="gpt-4-vision-preview",
        messages=messages,
        api_key=api_key,
        headers={"Openai-Version": "2020-11-07"},
        max_tokens=700,
    )
    story = result.choices[0].message.content
    print(story)
    return story
def text_to_audio(text, api_key, voice):
    """Synthesize speech for *text* with OpenAI's TTS endpoint.

    Args:
        text: Script to speak.
        api_key: OpenAI API key, sent as a bearer token.
        voice: OpenAI TTS voice id (e.g. 'echo', 'nova').

    Returns:
        Tuple of (temp_audio_path, BytesIO of the audio bytes). The caller
        is responsible for deleting temp_audio_path.

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {api_key}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": voice,
        },
    )
    if response.status_code != 200:
        # Include the status code and body so failures are diagnosable
        # (the original message omitted both).
        raise Exception(
            f"Request failed with status code {response.status_code}: {response.text}"
        )
    # Read the body once instead of iterating response content twice.
    audio_bytes = response.content
    audio_bytes_io = io.BytesIO(audio_bytes)
    # tts-1 returns MP3 by default; the original's ".wav" suffix mislabeled
    # the container for downstream decoders.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmpfile:
        tmpfile.write(audio_bytes)
        audio_filename = tmpfile.name
    return audio_filename, audio_bytes_io
def merge_audio_video(video_filename, audio_filename, output_filename):
    """Mux an audio track onto a video, trimmed to the shorter duration.

    Args:
        video_filename: Path to the source video.
        audio_filename: Path to the audio track.
        output_filename: Path where the merged H.264/AAC file is written.

    Returns:
        output_filename, after the merged file has been written.
    """
    print("Merging audio and video ...")
    video_clip = VideoFileClip(video_filename)
    audio_clip = AudioFileClip(audio_filename)
    try:
        # Trim both streams to the shorter one so neither trails silence
        # or frozen frames.
        min_duration = min(video_clip.duration, audio_clip.duration)
        video_clip = video_clip.subclip(0, min_duration)
        audio_clip = audio_clip.subclip(0, min_duration)
        final_clip = video_clip.set_audio(audio_clip)
        final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
        # The original never closed the composite clip (handle leak).
        final_clip.close()
    finally:
        # Release file handles even if encoding fails.
        video_clip.close()
        audio_clip.close()
    return output_filename
# def merge_audio_video(video_filename, audio_filename, output_filename):
# print("Merging audio and video ...")
# video_clip = VideoFileClip(video_filename)
# audio_clip = AudioFileClip(audio_filename)
# final_clip = video_clip.set_audio(audio_clip)
# final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
# video_clip.close()
# audio_clip.close()
# return output_filename
def main():
    """Streamlit entry point: gate on password, upload a video, generate a
    GPT-4V voiceover script, synthesize speech, and mux it onto the video.
    """
    st.set_page_config(page_title="AI Voiceover", page_icon="🔮")
    st.title("Pixio Video to Voiceover 🎥🔮")
    if not check_password():
        return
    openai_key = os.getenv('OPENAI_API_KEY')
    if not openai_key:
        st.error("OpenAI API key is not set in .env.local")
        return
    uploaded_file = st.file_uploader("Select a video file", type=["mp4", "avi"])
    # Display label -> OpenAI TTS voice id.
    voice_options = {
        'Echo (Male)': 'echo',
        'Fable (Male)': 'fable',
        'Onyx (Male)': 'onyx',
        'Nova (Female)': 'nova',
        'Shimmer (Female)': 'shimmer',
        'Alloy (Female)': 'alloy'
    }
    option = st.selectbox('Choose the voice you want', list(voice_options.keys()))
    classify = voice_options[option]
    duration_options = list(range(10, 121, 10))  # 10 to 120 seconds, in 10 second intervals
    selected_duration = st.selectbox('Select the desired video duration (seconds)', duration_options)
    # Script generator type drives the default prompt wording below.
    script_type_options = {
        'Product Tutorial': 'Product Tutorial',
        'TikTok': 'TikTok',
        'YouTube Short': 'YouTube Short',
        'Website Tutorial': 'Website Tutorial',
        'General Info': 'General Info'
    }
    selected_script_type = st.selectbox('Choose the script generator type', list(script_type_options.keys()))
    # Incorporating the selected script type and duration into the prompt
    dynamic_prompt_intro = f"Script type: {selected_script_type}. Generate a voiceover script that is approximately {selected_duration} seconds long, tailored to the content and format of a {selected_script_type.lower()}."
    prompt = st.text_area("Edit the voiceover script prompt as needed:", value=dynamic_prompt_intro, height=300)
    if uploaded_file is not None and st.button("START PROCESSING", type="primary"):
        with st.spinner("Video is being processed..."):
            base64Frame, video_filename, video_duration = video_to_frames(uploaded_file, frame_sampling_rate=1)
            if video_duration > selected_duration:
                st.error(f"The video exceeds the selected duration of {selected_duration} seconds.")
                # Fix: the original leaked the temp video file on this
                # early return; remove it before bailing out.
                os.unlink(video_filename)
                return
            text = frames_to_story(base64Frame, prompt, openai_key)
            st.write(text)
            audio_filename, audio_bytes_io = text_to_audio(text, openai_key, classify)
            output_video_filename = os.path.splitext(video_filename)[0] + "_output.mp4"
            final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
            st.video(final_video_filename)
            # Clean up all temp artifacts once Streamlit has the video.
            os.unlink(video_filename)
            os.unlink(audio_filename)
            os.unlink(final_video_filename)


if __name__ == "__main__":
    main()