vid2voiceover

Sleeping

File size: 7,301 Bytes

from dotenv import load_dotenv
import streamlit as st
from moviepy.editor import VideoFileClip, AudioFileClip
import cv2
import base64
import io
import openai
import os
import requests
import tempfile

# Load environment variables from .env.local so that the os.getenv() lookups
# further down (PASSWORD and OPENAI_API_KEY) can find their values.
load_dotenv('.env.local')

def check_password():
    """Gate the app behind the shared password configured in the environment.

    Reads the expected password from the ``PASSWORD`` environment variable and
    renders a Streamlit password field for the user to fill in.

    Returns:
        bool: ``True`` when the entered password matches the configured one;
        ``False`` when no password is configured or the entry does not match.
    """
    import hmac  # local import: only this function needs it

    correct_password = os.getenv('PASSWORD')
    # An empty value is treated like a missing one: otherwise the initially
    # empty input field would match it and unlock the app for everyone.
    if not correct_password:
        st.error("Password is not set in .env.local")
        return False

    user_password = st.text_input("Enter the password to proceed", type="password")

    # Constant-time comparison so the check does not leak how many leading
    # characters matched. Both sides are encoded because compare_digest only
    # accepts ASCII-only str, while bytes work for any password.
    entered = (user_password or "").encode("utf-8")
    expected = correct_password.encode("utf-8")
    if hmac.compare_digest(entered, expected):
        return True

    # Only report a mismatch once the user explicitly asks for a check, so
    # the error is not shown while the field is still empty or being typed.
    if st.button("Check Password"):
        st.error("Incorrect password")
    return False

def video_to_frames(video_file, frame_sampling_rate=1):
    """Sample frames from an uploaded video and encode them as base64 JPEGs.

    The upload is first copied to a temporary file on disk (always with an
    ``.mp4`` suffix) because both MoviePy and OpenCV are given a file path.
    The temporary file is NOT deleted here; its path is returned so the
    caller can reuse it and remove it afterwards.

    Args:
        video_file: Object with a ``read()`` method returning the video bytes.
        frame_sampling_rate: Seconds of video between two sampled frames.

    Returns:
        tuple: ``(base64Frame, video_filename, video_duration)`` where
        ``base64Frame`` is a list of base64-encoded JPEG strings,
        ``video_filename`` is the path of the temporary copy, and
        ``video_duration`` is the duration reported by MoviePy.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        tmpfile.write(video_file.read())
        video_filename = tmpfile.name

    # MoviePy is only needed for the metadata; close the clip right away so
    # its reader does not stay open for the rest of the run.
    video_clip = VideoFileClip(video_filename)
    try:
        video_duration = video_clip.duration
        fps = video_clip.fps
    finally:
        video_clip.close()

    # Clamp to at least 1: a low fps or a fractional sampling rate would
    # otherwise truncate to 0 and make the modulo below divide by zero.
    frames_to_skip = max(1, int(fps * frame_sampling_rate))

    video = cv2.VideoCapture(video_filename)
    base64Frame = []
    current_frame = 0

    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            # Keep every `frames_to_skip`-th frame, starting with the first.
            if current_frame % frames_to_skip == 0:
                _, buffer = cv2.imencode('.jpg', frame)
                base64Frame.append(base64.b64encode(buffer).decode("utf-8"))
            current_frame += 1
    finally:
        # Release the capture even if reading or encoding raised.
        video.release()

    print(f"{len(base64Frame)} frames read at a sampling rate of {frame_sampling_rate} second(s) per frame.")
    return base64Frame, video_filename, video_duration

def frames_to_story(base64Frames, prompt, api_key, frame_stride=50,
                    model="gpt-4-vision-preview", max_tokens=700):
    """Ask the OpenAI vision model to write a voiceover script for the frames.

    Sends the text prompt followed by a subset of the frames in a single user
    message through ``openai.ChatCompletion.create`` and returns the text of
    the first choice.

    Args:
        base64Frames: List of base64-encoded frame images.
        prompt: Instruction text placed before the images.
        api_key: OpenAI API key passed with the request.
        frame_stride: Only every ``frame_stride``-th entry of ``base64Frames``
            (starting with the first) is sent. Defaults to 50, the value that
            used to be hard-coded.
        model: Model name sent with the request.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        str: The generated script, which is also printed to stdout.
    """
    # NOTE(review): the caller in this file already samples about one frame
    # per second, so the default stride of 50 sends very few images for short
    # videos - confirm whether a smaller stride is intended.
    sampled_frames = base64Frames[::frame_stride]

    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                prompt,
                *({"image": frame, "resize": 768} for frame in sampled_frames),
            ],
        },
    ]
    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK
    # interface - confirm the pinned openai version before upgrading.
    params = {
        "model": model,
        "messages": PROMPT_MESSAGES,
        "api_key": api_key,
        "headers": {"Openai-Version": "2020-11-07"},
        "max_tokens": max_tokens,
    }
    result = openai.ChatCompletion.create(**params)
    story = result.choices[0].message.content
    print(story)
    return story

def text_to_audio(text, api_key, voice, timeout=120):
    """Convert text to speech with the OpenAI speech endpoint.

    Posts the text to ``https://api.openai.com/v1/audio/speech`` using the
    ``tts-1`` model and stores the returned audio both in memory and in a
    temporary file. The temporary file is NOT deleted here; the caller is
    responsible for removing it.

    Args:
        text: The text to be spoken.
        api_key: OpenAI API key, sent as a bearer token.
        voice: Voice identifier forwarded in the request body.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the app indefinitely.

    Returns:
        tuple: ``(audio_filename, audio_bytes_io)`` - the path of the
        temporary audio file and an ``io.BytesIO`` positioned at the start of
        the same audio data.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
    """
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {api_key}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": voice,
        },
        timeout=timeout,
    )

    if response.status_code != 200:
        # Include the status and the start of the body so the failure can be
        # diagnosed (bad key, rate limit, invalid voice, ...).
        raise Exception(
            f"Request failed with status code {response.status_code}: "
            f"{response.text[:500]}"
        )

    # Read the body once and reuse it for both outputs instead of iterating
    # over the response twice.
    audio_bytes = response.content
    audio_bytes_io = io.BytesIO(audio_bytes)

    # NOTE(review): the request does not set a response format, so the data
    # may not actually be WAV despite the suffix - confirm against the API.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        tmpfile.write(audio_bytes)
        audio_filename = tmpfile.name

    return audio_filename, audio_bytes_io
    
def merge_audio_video(video_filename, audio_filename, output_filename):
    """Replace a video's audio track with the given audio file.

    Both inputs are trimmed to the shorter of the two durations before they
    are combined, and the result is written with the ``libx264`` video codec
    and the ``aac`` audio codec.

    Args:
        video_filename: Path of the source video.
        audio_filename: Path of the audio to lay over the video.
        output_filename: Path the merged video is written to.

    Returns:
        The ``output_filename`` that was passed in.
    """
    print("Merging audio and video ...")
    source_video = VideoFileClip(video_filename)
    try:
        source_audio = AudioFileClip(audio_filename)
        try:
            # Trim both tracks to the shorter one so neither runs past the
            # end of the other in the output.
            min_duration = min(source_video.duration, source_audio.duration)
            trimmed_video = source_video.subclip(0, min_duration)
            trimmed_audio = source_audio.subclip(0, min_duration)

            final_clip = trimmed_video.set_audio(trimmed_audio)
            final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
        finally:
            # Close the opened clips even when trimming or encoding fails, so
            # the underlying readers are not left open.
            source_audio.close()
    finally:
        source_video.close()

    return output_filename

    
    
# Earlier version of merge_audio_video, kept for reference only. Unlike the
# active implementation above, it does not trim audio and video to the
# shorter of the two durations.
# def merge_audio_video(video_filename, audio_filename, output_filename):
#     print("Merging audio and video ...")
#     video_clip = VideoFileClip(video_filename)
#     audio_clip = AudioFileClip(audio_filename)
#     final_clip = video_clip.set_audio(audio_clip)
#     final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
#     video_clip.close()
#     audio_clip.close()
#     return output_filename


def main():
    """Render the Streamlit page and run the video-to-voiceover pipeline.

    Flow: password gate -> API key check -> upload and option widgets ->
    on "START PROCESSING": sample frames, generate the script, synthesize the
    speech, merge it with the video and display the result. Every temporary
    file created along the way is removed afterwards, including when
    processing stops early or fails.
    """
    st.set_page_config(page_title="AI Voiceover", page_icon="🔮")
    st.title("Pixio Video to Voiceover 🎥🔮")

    if not check_password():
        return

    openai_key = os.getenv('OPENAI_API_KEY')
    if not openai_key:
        st.error("OpenAI API key is not set in .env.local")
        return

    uploaded_file = st.file_uploader("Select a video file", type=["mp4", "avi"])

    # Display label -> voice identifier sent to the speech endpoint.
    voice_options = {
        'Echo (Male)': 'echo',
        'Fable (Male)': 'fable',
        'Onyx (Male)': 'onyx',
        'Nova (Female)': 'nova',
        'Shimmer (Female)': 'shimmer',
        'Alloy (Female)': 'alloy'
    }
    option = st.selectbox('Choose the voice you want', list(voice_options.keys()))
    selected_voice = voice_options[option]

    duration_options = list(range(10, 121, 10))  # 10 to 120 seconds, in 10 second intervals
    selected_duration = st.selectbox('Select the desired video duration (seconds)', duration_options)

    # Dropdown for the script generator type
    script_type_options = {
        'Product Tutorial': 'Product Tutorial',
        'TikTok': 'TikTok',
        'YouTube Short': 'YouTube Short',
        'Website Tutorial': 'Website Tutorial',
        'General Info': 'General Info'
    }
    selected_script_type = st.selectbox('Choose the script generator type', list(script_type_options.keys()))

    # The selected script type and duration seed the prompt; the user can
    # still edit the text before processing starts.
    dynamic_prompt_intro = f"Script type: {selected_script_type}. Generate a voiceover script that is approximately {selected_duration} seconds long, tailored to the content and format of a {selected_script_type.lower()}."

    prompt = st.text_area("Edit the voiceover script prompt as needed:", value=dynamic_prompt_intro, height=300)

    if uploaded_file is not None and st.button("START PROCESSING", type="primary"):
        # Paths of temporary files created during this run. They are removed
        # in the `finally` below so that neither the early return for
        # over-long videos nor an exception leaves them behind.
        temp_files = []
        try:
            with st.spinner("Video is being processed..."):
                base64Frame, video_filename, video_duration = video_to_frames(uploaded_file, frame_sampling_rate=1)
                temp_files.append(video_filename)

                if video_duration > selected_duration:
                    st.error(f"The video exceeds the selected duration of {selected_duration} seconds.")
                    return

                text = frames_to_story(base64Frame, prompt, openai_key)
                st.write(text)

                # The in-memory copy of the audio is not needed here.
                audio_filename, _ = text_to_audio(text, openai_key, selected_voice)
                temp_files.append(audio_filename)

                output_video_filename = os.path.splitext(video_filename)[0] + "_output.mp4"
                # Registered before merging: a failed merge may still leave a
                # partial output file on disk.
                temp_files.append(output_video_filename)

                final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
                st.video(final_video_filename)
        finally:
            for path in temp_files:
                try:
                    os.unlink(path)
                except FileNotFoundError:
                    # Best-effort cleanup: the file was never created.
                    pass

# Script entry point: build the Streamlit page only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()