Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import subprocess
|
3 |
import os
|
@@ -5,56 +15,139 @@ import shutil
|
|
5 |
import uuid
|
6 |
from transformers import pipeline
|
7 |
from gtts import gTTS
|
|
|
8 |
|
|
|
|
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
try:
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
if not os.path.exists(audio_path):
|
16 |
-
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
23 |
)
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
# 4.
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
|
35 |
tts.save(translated_audio_path)
|
36 |
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
|
|
|
|
|
|
|
|
39 |
except Exception as e:
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
# Create the Gradio interface
|
44 |
iface = gr.Interface(
|
45 |
fn=translate_video,
|
46 |
-
inputs=gr.Video(label="Upload
|
47 |
outputs=[
|
48 |
-
gr.
|
49 |
-
gr.
|
|
|
|
|
50 |
gr.Video(label="Original Video"),
|
51 |
],
|
52 |
-
title="
|
53 |
-
description="
|
54 |
allow_flagging="never",
|
|
|
|
|
|
|
|
|
|
|
55 |
)
|
56 |
|
57 |
if __name__ == "__main__":
|
58 |
-
if not os.path.exists("downloads"):
|
59 |
-
os.makedirs("downloads")
|
60 |
iface.launch()
|
|
|
1 |
+
#
|
2 |
+
# ----- Prerequisites -----
|
3 |
+
# 1. Install required Python libraries:
|
4 |
+
# pip install gradio transformers torch gtts langdetect
|
5 |
+
#
|
6 |
+
# 2. Install ffmpeg on your system.
|
7 |
+
# - (Mac) brew install ffmpeg
|
8 |
+
# - (Ubuntu) sudo apt install ffmpeg
|
9 |
+
# - (Windows) choco install ffmpeg
|
10 |
+
#
|
11 |
import os
import shutil
import subprocess
import tempfile
import uuid

import gradio as gr
from gtts import gTTS
from langdetect import DetectorFactory, detect
from transformers import pipeline
|
19 |
|
20 |
+
# langdetect is non-deterministic by default; pinning the seed makes repeated
# detections of the same text return the same language code.
DetectorFactory.seed = 0
|
22 |
|
23 |
+
# --- 1. Load the ASR model once at import time ---
# Keeping the pipeline at module scope avoids re-initialising the model on
# every request to the Gradio handler.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # tiny for speed; swap in base/small/... for accuracy
        device="cpu",  # use "cuda:0" with a CUDA-enabled torch build
    )
except Exception as e:
    print(f"Error loading model: {e}")
    # Sentinel checked by translate_video so the app degrades gracefully.
    asr_pipeline = None
else:
    print("Whisper model loaded successfully.")
|
37 |
+
|
38 |
+
def translate_video(video_path):
    """Transcribe a video's audio, detect its language, and translate it to English.

    Args:
        video_path: Path to the uploaded video file (supplied by gr.Video).

    Returns:
        A 5-tuple matching the Gradio outputs:
        (summary_markdown, original_transcript, translated_text,
         translated_audio_path, original_video_path).
        On error the first element carries the message and the rest are None.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        return "Model not loaded.", None, None, None, None

    # Unique working directory so concurrent requests never collide.
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")

        # -y overwrite, -vn drop video; pcm_s16le mono at 16 kHz is the
        # sample format Whisper expects.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path,
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")

        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=False,  # full-transcript text only
            generate_kwargs={"task": "transcribe"},
        )
        original_transcript = transcription_result["text"].strip()

        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            return "No speech detected.", "N/A", "N/A", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
        except Exception:
            detected_language_code = "Unknown"

        # --- 4. Translate the audio into English ---
        # BUG FIX: in Whisper generate_kwargs, "language" names the *source*
        # language, so forcing "en" told the model the audio was already
        # English and defeated translation. The translate task always targets
        # English; let the model detect the source itself.
        gr.Info("Step 4/5: Translating audio to English...")
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=False,
            generate_kwargs={"task": "translate"},
        )
        translated_text = translation_result["text"].strip()

        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        if translated_text:
            # BUG FIX: the finally block removes temp_dir before Gradio can
            # serve a file stored inside it, so write the output audio to a
            # persistent temp file outside the per-request directory.
            fd, translated_audio_path = tempfile.mkstemp(prefix="translated_", suffix=".mp3")
            os.close(fd)
            gTTS(translated_text, lang='en').save(translated_audio_path)
        else:
            # gTTS raises on empty input; return no audio instead of crashing.
            translated_audio_path = None

        # Detailed summary rendered by the Markdown output component.
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`

---

### Translated Text (English)
{translated_text}
"""

        return summary_markdown, original_transcript, translated_text, translated_audio_path, video_path

    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        return error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        return error_message, None, None, None, None
    finally:
        # Clean up the per-request working directory (safe now that the
        # returned audio file lives outside it).
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
129 |
+
|
130 |
|
131 |
+
# --- Build the Gradio UI ---
# Output components, in the same order as translate_video's return tuple.
_output_components = [
    gr.Markdown(label="Summary"),
    gr.Textbox(label="Original Transcript", interactive=False, lines=5),
    gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
    gr.Audio(label="Translated Audio (English)"),
    gr.Video(label="Original Video"),
]

iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=["upload"]),
    outputs=_output_components,
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides original transcript, translated text, and translated audio.",
    allow_flagging="never",
    # Drop video files into an 'examples' folder next to this script and list
    # their paths here, e.g.
    # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")],
    examples=[],
)
|
151 |
|
152 |
if __name__ == "__main__":
    # Start the Gradio server (blocks until interrupted).
    iface.launch()
|