cstr committed on
Commit 4347dae
1 Parent(s): 73225e0
Files changed (1)
  1. app.py +32 -31
app.py CHANGED
@@ -95,37 +95,42 @@ def download_youtube_audio(url, method_choice):
 
 def yt_dlp_method(url):
     """
-    Downloads YouTube audio using yt-dlp.
-
+    Downloads YouTube audio using yt-dlp and saves it to a temporary file.
+
     Args:
         url (str): The YouTube URL.
-
+
     Returns:
         str: Path to the downloaded audio file.
     """
     logging.info("Using yt-dlp method")
+    temp_dir = tempfile.mkdtemp()
+    output_template = os.path.join(temp_dir, '%(id)s.%(ext)s')
     ydl_opts = {
         'format': 'bestaudio/best',
+        'outtmpl': output_template,
         'postprocessors': [{
             'key': 'FFmpegExtractAudio',
             'preferredcodec': 'mp3',
             'preferredquality': '192',
         }],
-        'outtmpl': '%(id)s.%(ext)s',
+        'quiet': True,
+        'no_warnings': True,
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         info = ydl.extract_info(url, download=True)
-        output_file = f"{info['id']}.mp3"
+        output_file = ydl.prepare_filename(info)
+        output_file = os.path.splitext(output_file)[0] + '.mp3'
     logging.info(f"Downloaded YouTube audio: {output_file}")
     return output_file
 
 def pytube_method(url):
     """
-    Downloads audio using pytube.
-
+    Downloads audio from a YouTube URL using pytube and saves it to a temporary file.
+
     Args:
         url (str): The YouTube URL.
-
+
     Returns:
         str: Path to the downloaded audio file.
     """
@@ -133,7 +138,8 @@ def pytube_method(url):
     from pytube import YouTube
     yt = YouTube(url)
     audio_stream = yt.streams.filter(only_audio=True).first()
-    out_file = audio_stream.download()
+    temp_dir = tempfile.mkdtemp()
+    out_file = audio_stream.download(output_path=temp_dir)
     base, ext = os.path.splitext(out_file)
     new_file = base + '.mp3'
     os.rename(out_file, new_file)
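Both helpers now download into a directory created with tempfile.mkdtemp(), so the caller is responsible for removing that directory once the audio file is no longer needed. A minimal usage sketch (not part of app.py; the URL is a placeholder and the cleanup strategy is only one option), assuming yt_dlp_method from above is in scope:

import os
import shutil

audio_file = yt_dlp_method("https://www.youtube.com/watch?v=EXAMPLE_ID")  # placeholder URL
print(f"Downloaded to {audio_file}")
# ... use the file, e.g. feed it to transcribe_audio() ...
shutil.rmtree(os.path.dirname(audio_file), ignore_errors=True)  # remove the temp directory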
@@ -396,10 +402,10 @@ loaded_models = {}
 
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     """
-    Transcribes audio from a given source using the specified pipeline and model.
+    Transcribes audio from a given URL using the specified pipeline and model.
 
     Args:
-        input_source (str or file): URL of audio, path to local file, or uploaded file object.
+        input_source (str): URL of the audio.
         pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
         model_id (str): The ID of the model to use.
         dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
@@ -424,32 +430,22 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     if verbose:
         yield verbose_messages, "", None
 
-    # Determine if input_source is a URL or file
-    audio_path = None
-    is_temp_file = False
-
-    if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
-        # Input source is a URL
-        audio_path, is_temp_file = download_audio(input_source, download_method)
-        if not audio_path or audio_path.startswith("Error"):
-            yield f"Error downloading audio: {audio_path}", "", None
-            return
-    elif isinstance(input_source, str) and os.path.exists(input_source):
-        # Input source is a local file path
-        audio_path = input_source
-        is_temp_file = False
-    elif hasattr(input_source, 'name'):
-        # Input source is an uploaded file object
-        audio_path = input_source.name
-        is_temp_file = False
-    else:
-        yield "No valid audio source provided.", "", None
+    # Input source is expected to be a URL
+    if not input_source or not input_source.strip():
+        yield "No audio URL provided.", "", None
+        return
+
+    # Download the audio from the URL
+    audio_path, is_temp_file = download_audio(input_source, download_method)
+    if not audio_path or audio_path.startswith("Error"):
+        yield f"Error downloading audio: {audio_path}", "", None
         return
 
     # Convert start_time and end_time to float or None
     start_time = float(start_time) if start_time else None
     end_time = float(end_time) if end_time else None
 
+    # Trim the audio if start or end times are provided
     if start_time is not None or end_time is not None:
         audio_path = trim_audio(audio_path, start_time, end_time)
         is_temp_file = True  # The trimmed audio is a temporary file
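trim_audio() itself is defined elsewhere in app.py and is not shown in this diff. For orientation only, a hypothetical sketch of such a helper built on the ffmpeg CLI (which the yt-dlp postprocessor above already requires); this is an assumption, not the app's actual implementation:

import os
import subprocess
import tempfile

def trim_audio_sketch(audio_path, start_time=None, end_time=None):
    # Write the trimmed clip to a fresh temporary file, mirroring is_temp_file=True above.
    fd, out_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    cmd = ["ffmpeg", "-y", "-i", audio_path]
    if start_time is not None:
        cmd += ["-ss", str(start_time)]
    if end_time is not None:
        cmd += ["-to", str(end_time)]
    cmd += ["-c", "copy", out_path]
    subprocess.run(cmd, check=True, capture_output=True)
    return out_path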
@@ -463,6 +459,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         model_or_pipeline = loaded_models[model_key]
         logging.info("Loaded model from cache")
     else:
+        # Load the appropriate model or pipeline based on the pipeline type
         if pipeline_type == "faster-batched":
             model = WhisperModel(model_id, device=device, compute_type=dtype)
             model_or_pipeline = BatchedInferencePipeline(model=model)
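The cache above keeps each loaded model or pipeline in the module-level loaded_models dict, so repeated requests with the same settings skip the expensive load. A condensed sketch of the pattern for the 'faster-batched' case; the exact makeup of model_key in app.py is assumed here to be the pipeline type, model ID and dtype:

from faster_whisper import WhisperModel, BatchedInferencePipeline

loaded_models = {}

def get_or_load_batched(model_id, dtype, device="auto"):
    model_key = ("faster-batched", model_id, dtype)  # assumed cache key
    if model_key not in loaded_models:
        model = WhisperModel(model_id, device=device, compute_type=dtype)
        loaded_models[model_key] = BatchedInferencePipeline(model=model)
    return loaded_models[model_key]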
@@ -495,6 +492,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
             raise ValueError("Invalid pipeline type")
         loaded_models[model_key] = model_or_pipeline  # Cache the model or pipeline
 
+    # Perform the transcription
     start_time_perf = time.time()
     if pipeline_type == "faster-batched":
         segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
@@ -505,6 +503,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         segments = result["chunks"]
     end_time_perf = time.time()
 
+    # Calculate metrics
     transcription_time = end_time_perf - start_time_perf
     audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
 
@@ -516,6 +515,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     if verbose:
         yield verbose_messages + metrics_output, "", None
 
+    # Compile the transcription text
     transcription = ""
 
     for segment in segments:
@@ -527,6 +527,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     if verbose:
         yield verbose_messages + metrics_output, transcription, None
 
+    # Save the transcription to a file
     transcription_file = save_transcription(transcription)
     yield verbose_messages + metrics_output, transcription, transcription_file
 
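transcribe_audio() is a generator: it yields (status_text, transcription, transcription_file) tuples as work progresses, and the last yield carries the final transcription and the saved file path. A hypothetical caller sketch; the argument values are illustrative and the "yt-dlp" method name is an assumption about the download_method strings accepted by the app:

result = None
for status, text, file_path in transcribe_audio(
        "https://www.youtube.com/watch?v=EXAMPLE_ID",  # placeholder URL
        pipeline_type="faster-batched",
        model_id="large-v3",
        dtype="float16",
        batch_size=8,
        download_method="yt-dlp",  # assumed method name
        verbose=True,
):
    result = (status, text, file_path)
    print(status)

status_log, transcription, transcription_file = result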