broadfield-dev committed on
Commit
a74e608
·
verified ·
1 Parent(s): 5181a56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -27
app.py CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import subprocess
3
  import os
@@ -5,56 +15,139 @@ import shutil
5
  import uuid
6
  from transformers import pipeline
7
  from gtts import gTTS
 
8
 
 
 
9
 
10
- def translate_video(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  try:
13
- audio_path = os.path.join(file_path, "audio.wav")
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  if not os.path.exists(audio_path):
16
- raise FileNotFoundError("Audio extraction failed. yt-dlp did not produce a .wav file.")
17
 
18
- # 3. Translate the audio using the whisper-tiny model
19
- translator = pipeline(
20
- "automatic-speech-recognition",
21
- model="openai/whisper-tiny",
22
- device="cpu"
 
23
  )
24
-
25
- translation = translator(audio_path, return_timestamps=True, generate_kwargs={"task": "translate"})
26
-
27
- translated_text = translation["text"]
 
28
 
29
- if not translated_text:
30
- return "No speech was detected in the video.", None, video_path
 
 
 
 
 
 
31
 
32
- # 4. Convert translated text to speech using gTTS
33
- tts = gTTS(translated_text.strip(), lang='en')
 
 
 
 
 
 
 
 
 
 
34
  translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
35
  tts.save(translated_audio_path)
36
 
37
- return translated_text, translated_audio_path, video_path
 
 
 
 
 
 
 
 
 
 
 
38
 
 
 
 
 
39
  except Exception as e:
40
- gr.Warning(f"An unexpected error occurred: {str(e)}")
41
- return f"An error occurred: {str(e)}", None, None
 
 
 
 
 
 
42
 
43
- # Create the Gradio interface
44
  iface = gr.Interface(
45
  fn=translate_video,
46
- inputs=gr.Video(label="Upload your video to translate"),
47
  outputs=[
48
- gr.Textbox(label="Translated Text", interactive=False),
49
- gr.Audio(label="Translated Audio"),
 
 
50
  gr.Video(label="Original Video"),
51
  ],
52
- title="Twitter/X Video Translator",
53
- description="Enter a link to a Twitter/X video to translate its audio to English. Handles videos longer than 30 seconds.",
54
  allow_flagging="never",
 
 
 
 
 
55
  )
56
 
57
  if __name__ == "__main__":
58
- if not os.path.exists("downloads"):
59
- os.makedirs("downloads")
60
  iface.launch()
 
1
+ #
2
+ # ----- Prerequisites -----
3
+ # 1. Install required Python libraries:
4
+ # pip install gradio transformers torch gtts langdetect
5
+ #
6
+ # 2. Install ffmpeg on your system.
7
+ # - (Mac) brew install ffmpeg
8
+ # - (Ubuntu) sudo apt install ffmpeg
9
+ # - (Windows) choco install ffmpeg
10
+ #
11
  import gradio as gr
12
  import subprocess
13
  import os
 
15
  import uuid
16
  from transformers import pipeline
17
  from gtts import gTTS
18
+ from langdetect import detect, DetectorFactory
19
 
20
# langdetect is nondeterministic by default; pinning the seed makes repeated
# detections of the same text return the same language code.
DetectorFactory.seed = 0

# --- 1. Load the model only once ---
# A single module-level pipeline avoids reloading the model on every request.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # tiny for speed; base/small/... trade speed for accuracy
        device="cpu",  # switch to "cuda:0" when a CUDA-enabled torch is available
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Leave the pipeline unset; translate_video() checks for this and bails out.
    asr_pipeline = None
37
+
38
def translate_video(video_path):
    """Translate a video's audio track to English.

    Pipeline: extract audio with ffmpeg -> transcribe with Whisper ->
    detect source language -> translate with Whisper -> synthesize
    English speech with gTTS.

    Parameters
    ----------
    video_path : str
        Path to the uploaded video file (supplied by the gr.Video input).

    Returns
    -------
    tuple
        (summary_markdown, original_transcript, translated_text,
        translated_audio_path, video_path). On failure the first slot
        carries an error message and the remaining slots are None.
    """
    import tempfile  # local import: only needed for the persistent MP3 below

    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        return "Model not loaded.", None, None, None, None

    # Unique scratch directory so concurrent requests don't collide.
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")

        # -y overwrite, -vn drop video, pcm_s16le standard WAV codec,
        # -ar 16000 -ac 1: the mono 16 kHz input Whisper expects.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path,
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")

        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=False,  # full transcript only; no timestamps needed
            generate_kwargs={"task": "transcribe"},
        )
        original_transcript = transcription_result["text"].strip()

        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            return "No speech detected.", "N/A", "N/A", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
        except Exception:
            # langdetect raises on very short / symbol-only text.
            detected_language_code = "Unknown"

        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        # BUGFIX: do not pass language="en" here. For Whisper generation,
        # `language` names the *source* language of the audio; forcing "en"
        # told the model the audio was already English and broke translation
        # from other languages. task="translate" alone targets English.
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=False,
            generate_kwargs={"task": "translate"},
        )
        translated_text = translation_result["text"].strip()

        if not translated_text:
            # gTTS raises on empty input; fail gracefully instead.
            gr.Warning("Translation produced no text.")
            return "Translation produced no text.", original_transcript, "N/A", None, video_path

        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        # BUGFIX: the MP3 used to be written inside temp_dir, which the
        # `finally` block deletes before Gradio can serve the file, so the
        # audio output was always a dead path. Write it to a persistent
        # temp file instead; Gradio copies served files into its own cache.
        fd, translated_audio_path = tempfile.mkstemp(prefix="translated_audio_", suffix=".mp3")
        os.close(fd)  # gTTS reopens the path itself; keep no dangling fd
        tts.save(translated_audio_path)

        # Detailed summary shown in the Markdown output component.
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`

---

### Translated Text (English)
{translated_text}
"""

        return summary_markdown, original_transcript, translated_text, translated_audio_path, video_path

    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        return error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        return error_message, None, None, None, None
    finally:
        # Remove only the scratch directory; the translated MP3 now lives
        # outside it and survives for Gradio to serve.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
129
+
130
 
131
# --- Build the Gradio interface ---
# Output order must match the 5-tuple returned by translate_video.
_output_components = [
    gr.Markdown(label="Summary"),
    gr.Textbox(label="Original Transcript", interactive=False, lines=5),
    gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
    gr.Audio(label="Translated Audio (English)"),
    gr.Video(label="Original Video"),
]

iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=_output_components,
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides original transcript, translated text, and translated audio.",
    allow_flagging="never",
    # Drop video files into an 'examples' folder next to this script and list
    # them here to surface clickable examples, e.g.
    # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")]
    examples=[],
)
151
 
152
# Launch the web app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()