Update app.py
Browse files
app.py
CHANGED
@@ -61,6 +61,35 @@ def text_to_speech(voice_id, text, session_id):
|
|
61 |
audio_file.write(response.content)
|
62 |
return audio_file_path
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def upload_file(file_path):
|
65 |
with open(file_path, 'rb') as file:
|
66 |
files = {'fileToUpload': (os.path.basename(file_path), file)}
|
@@ -92,7 +121,7 @@ def lipsync_api_call(video_url, audio_url):
|
|
92 |
|
93 |
def check_job_status(job_id):
|
94 |
headers = {"x-api-key": B_KEY}
|
95 |
-
max_attempts = 30
|
96 |
|
97 |
for _ in range(max_attempts):
|
98 |
response = requests.get(f"{API_URL}/{job_id}", headers=headers)
|
@@ -107,31 +136,27 @@ def check_job_status(job_id):
|
|
107 |
return None
|
108 |
|
109 |
def get_media_duration(file_path):
|
110 |
-
# Fetch media duration using ffprobe
|
111 |
cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path]
|
112 |
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
113 |
return float(result.stdout.strip())
|
114 |
|
115 |
def combine_audio_video(video_path, audio_path, output_path):
|
116 |
-
# Get durations of both video and audio
|
117 |
video_duration = get_media_duration(video_path)
|
118 |
audio_duration = get_media_duration(audio_path)
|
119 |
|
120 |
if video_duration > audio_duration:
|
121 |
-
# Trim video to match the audio length
|
122 |
cmd = [
|
123 |
'ffmpeg', '-i', video_path, '-i', audio_path,
|
124 |
-
'-t', str(audio_duration),
|
125 |
'-map', '0:v', '-map', '1:a',
|
126 |
'-c:v', 'copy', '-c:a', 'aac',
|
127 |
'-y', output_path
|
128 |
]
|
129 |
else:
|
130 |
-
|
131 |
-
loop_count = int(audio_duration // video_duration) + 1 # Calculate how many times to loop
|
132 |
cmd = [
|
133 |
'ffmpeg', '-stream_loop', str(loop_count), '-i', video_path, '-i', audio_path,
|
134 |
-
'-t', str(audio_duration),
|
135 |
'-map', '0:v', '-map', '1:a',
|
136 |
'-c:v', 'copy', '-c:a', 'aac',
|
137 |
'-shortest', '-y', output_path
|
@@ -139,12 +164,20 @@ def combine_audio_video(video_path, audio_path, output_path):
|
|
139 |
|
140 |
subprocess.run(cmd, check=True)
|
141 |
|
142 |
-
def process_video(voice, model, text, progress=gr.Progress()):
|
143 |
-
session_id = str(uuid.uuid4())
|
144 |
-
|
145 |
-
|
146 |
-
if not
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
progress(0.2, desc="Processing video...")
|
150 |
video_path = os.path.join("models", model)
|
@@ -201,23 +234,40 @@ def create_interface():
|
|
201 |
gr.Markdown("# JSON Train")
|
202 |
with gr.Row():
|
203 |
with gr.Column():
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
generate_btn = gr.Button("Generate Video")
|
|
|
208 |
with gr.Column():
|
209 |
video_output = gr.Video(label="Generated Video")
|
210 |
status_output = gr.Textbox(label="Status", interactive=False)
|
211 |
|
212 |
-
def on_generate(voice_name, model_name, text):
|
213 |
voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
|
214 |
if not voice_id:
|
215 |
return None, "Invalid voice selected."
|
216 |
-
return process_video(voice_id, model_name, text)
|
217 |
|
218 |
generate_btn.click(
|
219 |
fn=on_generate,
|
220 |
-
inputs=[voice_dropdown, model_dropdown, text_input],
|
221 |
outputs=[video_output, status_output]
|
222 |
)
|
223 |
|
@@ -225,4 +275,4 @@ def create_interface():
|
|
225 |
|
226 |
if __name__ == "__main__":
|
227 |
app = create_interface()
|
228 |
-
app.launch()
|
|
|
61 |
audio_file.write(response.content)
|
62 |
return audio_file_path
|
63 |
|
64 |
+
def process_uploaded_audio(audio_file, session_id):
    """Validate an uploaded audio file and return a session-scoped MP3 path.

    Parameters:
        audio_file: file-like object exposing ``.name`` and ``.read()``
            (as supplied by the Gradio File component), or None.
        session_id: unique string used to namespace the temp files so
            concurrent sessions do not clobber each other's audio.

    Returns:
        Path to an MP3 file on success, or None when the input is missing,
        has an unsupported extension, or the conversion fails.
    """
    if audio_file is None:
        return None

    # Accept only the audio extensions the pipeline knows how to handle.
    ext = os.path.splitext(audio_file.name)[1].lower()
    if ext not in {'.mp3', '.wav', '.m4a', '.aac'}:
        return None

    # Persist the upload under a session-scoped name.
    audio_file_path = f'temp_voice_{session_id}{ext}'
    with open(audio_file_path, 'wb') as f:
        f.write(audio_file.read())

    if ext == '.mp3':
        return audio_file_path

    # Transcode everything else to MP3 so downstream steps see one format.
    mp3_path = f'temp_voice_{session_id}.mp3'
    cmd = [
        'ffmpeg', '-i', audio_file_path,
        '-codec:a', 'libmp3lame', '-qscale:a', '2',
        '-y', mp3_path,
    ]
    try:
        subprocess.run(cmd, check=True,
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Conversion failed (corrupt file or ffmpeg unavailable): clean up
        # the intermediate file and signal failure via the None contract
        # instead of crashing the UI callback.
        if os.path.exists(audio_file_path):
            os.remove(audio_file_path)
        return None
    os.remove(audio_file_path)
    return mp3_path
|
92 |
+
|
93 |
def upload_file(file_path):
|
94 |
with open(file_path, 'rb') as file:
|
95 |
files = {'fileToUpload': (os.path.basename(file_path), file)}
|
|
|
121 |
|
122 |
def check_job_status(job_id):
|
123 |
headers = {"x-api-key": B_KEY}
|
124 |
+
max_attempts = 30
|
125 |
|
126 |
for _ in range(max_attempts):
|
127 |
response = requests.get(f"{API_URL}/{job_id}", headers=headers)
|
|
|
136 |
return None
|
137 |
|
138 |
def get_media_duration(file_path):
    """Return the duration of a media file in seconds, probed via ffprobe.

    Parameters:
        file_path: path to any container/stream ffprobe can read.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits non-zero.
        ValueError: if ffprobe output cannot be parsed as a float.
    """
    cmd = [
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        file_path,
    ]
    # check=True surfaces probe failures directly; without it a failed probe
    # produced empty stdout and a contextless float('') ValueError.
    result = subprocess.run(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, check=True)
    return float(result.stdout.strip())
|
142 |
|
143 |
def combine_audio_video(video_path, audio_path, output_path):
    """Mux the audio track onto the video, matching the audio's duration.

    If the video is longer than the audio it is trimmed; if shorter it is
    looped enough times to cover the audio. The video stream is copied
    without re-encoding; audio is encoded to AAC.
    """
    video_len = get_media_duration(video_path)
    audio_len = get_media_duration(audio_path)

    if video_len > audio_len:
        # Longer video: simply cut it down to the audio's duration.
        head = ['ffmpeg', '-i', video_path, '-i', audio_path]
        tail = ['-y', output_path]
    else:
        # Shorter video: repeat it until it covers the audio.
        repeats = int(audio_len // video_len) + 1
        head = ['ffmpeg', '-stream_loop', str(repeats),
                '-i', video_path, '-i', audio_path]
        tail = ['-shortest', '-y', output_path]

    shared = ['-t', str(audio_len),
              '-map', '0:v', '-map', '1:a',
              '-c:v', 'copy', '-c:a', 'aac']
    subprocess.run(head + shared + tail, check=True)
|
166 |
|
167 |
+
def process_video(voice, model, text, audio_file, progress=gr.Progress()):
|
168 |
+
session_id = str(uuid.uuid4())
|
169 |
+
|
170 |
+
# Handle audio input (either text-to-speech or uploaded file)
|
171 |
+
if audio_file is not None:
|
172 |
+
progress(0.1, desc="Processing uploaded audio...")
|
173 |
+
audio_path = process_uploaded_audio(audio_file, session_id)
|
174 |
+
if not audio_path:
|
175 |
+
return None, "Failed to process uploaded audio file."
|
176 |
+
else:
|
177 |
+
progress(0.1, desc="Generating speech...")
|
178 |
+
audio_path = text_to_speech(voice, text, session_id)
|
179 |
+
if not audio_path:
|
180 |
+
return None, "Failed to generate speech audio."
|
181 |
|
182 |
progress(0.2, desc="Processing video...")
|
183 |
video_path = os.path.join("models", model)
|
|
|
234 |
gr.Markdown("# JSON Train")
|
235 |
with gr.Row():
|
236 |
with gr.Column():
|
237 |
+
with gr.Tab("Text to Speech"):
|
238 |
+
voice_dropdown = gr.Dropdown(
|
239 |
+
choices=[v[0] for v in voices],
|
240 |
+
label="Select Voice",
|
241 |
+
value=voices[0][0] if voices else None
|
242 |
+
)
|
243 |
+
text_input = gr.Textbox(label="Enter text", lines=3)
|
244 |
+
|
245 |
+
with gr.Tab("Upload Audio"):
|
246 |
+
audio_input = gr.File(
|
247 |
+
label="Upload Audio File",
|
248 |
+
file_types=["audio/*"]
|
249 |
+
)
|
250 |
+
|
251 |
+
model_dropdown = gr.Dropdown(
|
252 |
+
choices=models,
|
253 |
+
label="Select Video Model",
|
254 |
+
value=models[0] if models else None
|
255 |
+
)
|
256 |
generate_btn = gr.Button("Generate Video")
|
257 |
+
|
258 |
with gr.Column():
|
259 |
video_output = gr.Video(label="Generated Video")
|
260 |
status_output = gr.Textbox(label="Status", interactive=False)
|
261 |
|
262 |
+
def on_generate(voice_name, model_name, text, audio_file):
    """Resolve the selected voice and kick off video generation.

    Returns a (video, status) pair for the Gradio outputs.
    """
    voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
    # A voice is only required when synthesizing speech from text;
    # an uploaded audio file bypasses TTS entirely, so don't block it
    # on the voice dropdown.
    if audio_file is None and not voice_id:
        return None, "Invalid voice selected."
    return process_video(voice_id, model_name, text, audio_file)
|
267 |
|
268 |
generate_btn.click(
|
269 |
fn=on_generate,
|
270 |
+
inputs=[voice_dropdown, model_dropdown, text_input, audio_input],
|
271 |
outputs=[video_output, status_output]
|
272 |
)
|
273 |
|
|
|
275 |
|
276 |
if __name__ == "__main__":
    # Build the Gradio UI and serve it.
    interface = create_interface()
    interface.launch()
|